@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */


@/**
@*******************************************************************************
@*
@* @brief
@*  interprediction luma filter for vertical input
@*
@* @par description:
@*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*  the elements pointed by 'pu1_src' and writes to the location pointed by
@*  'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
@*  assumptions: the function is optimized considering the fact that width is
@*  a multiple of 4 or 8, and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
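
@/* for reference, a scalar sketch of the filter implemented below (not    */
@/* part of the build; ref_luma_vert and its plain c types are             */
@/* illustrative, assuming the 8-tap hevc luma filter and the argument     */
@/* semantics documented above - the assembly realises the signed taps     */
@/* with |coeff| and vmlal/vmlsl pairs):                                   */
@/*                                                                        */
@/* void ref_luma_vert(unsigned char *pu1_src, unsigned char *pu1_dst,     */
@/*                    int src_strd, int dst_strd,                         */
@/*                    signed char *pi1_coeff, int ht, int wd)             */
@/* {                                                                      */
@/*     for(int row = 0; row < ht; row++)                                  */
@/*         for(int col = 0; col < wd; col++)                              */
@/*         {                                                              */
@/*             int sum = 0;                                               */
@/*             for(int i = 0; i < 8; i++)      // 8 taps                  */
@/*                 sum += pi1_coeff[i] *                                  */
@/*                        pu1_src[(row + i - 3) * src_strd + col];        */
@/*             sum = (sum + 32) >> 6;          // round, downshift by 6   */
@/*             if(sum < 0) sum = 0;                                       */
@/*             if(sum > 255) sum = 255;        // clip to 8 bits          */
@/*             pu1_dst[row * dst_strd + col] = (unsigned char)sum;        */
@/*         }                                                              */
@/* }                                                                      */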

@void ihevc_inter_pred_luma_vert (
@    uword8 *pu1_src,
@    uword8 *pu1_dst,
@    word32 src_strd,
@    word32 dst_strd,
@    word8 *pi1_coeff,
@    word32 ht,
@    word32 wd )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 => src_strd
@   r6 => dst_strd
@   r12 => *pi1_coeff
@   r5 => ht
@   r3 => wd

.text
.align 4
.syntax unified


.globl ihevc_inter_pred_luma_vert_a9q

.type ihevc_inter_pred_luma_vert_a9q, %function

ihevc_inter_pred_luma_vert_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments

    ldr         r12,[sp,#40]            @load pi1_coeff
    mov         r6,r3
    ldr         r5,[sp,#48]             @load wd
    vld1.u8     {d0},[r12]              @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2        @r12 = -3*src_strd
    vabs.s8     d0,d0                   @vabs_s8(coeff)
    add         r0,r0,r12               @pu1_src -= 3*src_strd (first filter tap row)
    ldr         r3,[sp,#44]             @load ht
    subs        r7,r3,#0                @r7 = ht
@   ble         end_loops               @end loop jump
    vdup.u8     d22,d0[0]               @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]               @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]               @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]               @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]               @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]               @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]               @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]               @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt         core_loop_wd_4          @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                @r5 ->wd
    rsb         r9,r4,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb         r8,r4,r2,lsl #2         @r2->src_strd
    mov         r3, r5, lsr #3          @divide by 8
    mul         r7, r3                  @multiply height by width
    sub         r7, #4                  @reserve one 4-row block iteration for the epilog
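
@/* the wd >= 8 path below is software pipelined: 'prolog' issues the      */
@/* first block of loads and multiply-accumulates, 'kernel_8' is the       */
@/* steady state that overlaps loads, macs and stores across four result   */
@/* accumulators (q4..q7), and 'epilog'/'epilog_end' drain the in-flight   */
@/* results. each iteration produces a 4-row x 8-column block; r8/r9 step  */
@/* the src and dst pointers to the next row of strips once a strip row    */
@/* is exhausted (the addle/bicle instructions below).                     */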

prolog:

    and         r10, r0, #31
    add         r3,r0,r2                @pu1_src_tmp += src_strd@
    vld1.u8     {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22               @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24               @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27               @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29               @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    vld1.u8     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q5,d1,d22               @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    bicle       r4,r5,#7                @r5 ->wd
    vmlsl.u8    q5,d3,d24               @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27               @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r3, r3, r2
    vmlal.u8    q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29              @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                @pu1_src_tmp += src_strd@
    vqrshrun.s16 d8,q4,#6               @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d1},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vld1.u8     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6              @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6


    blt         epilog_end              @jumps to epilog_end
    beq         epilog                  @jumps to epilog

kernel_8:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22               @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    bicle       r4,r5,#7                @r5 ->wd
    vmlsl.u8    q4,d2,d24               @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    vld1.u8     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27               @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vmlal.u8    q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    vmlsl.u8    q4,d7,d29               @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

@   and         r11, r0, #31
    vqrshrun.s16 d14,q7,#6

    add         r3,r0,r2                @pu1_src_tmp += src_strd@
    vmull.u8    q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vld1.u8     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d1,d22               @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vmlsl.u8    q5,d3,d24               @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vst1.8      {d14},[r14],r6
    vmlal.u8    q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    add         r14,r1,#0
    vmlsl.u8    q5,d6,d27               @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r1, r1, #8
    vmlal.u8    q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    vmlsl.u8    q5,d16,d29              @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    addle       r1,r1,r9
    vqrshrun.s16 d8,q4,#6               @sto_res = vqmovun_s16(sto_res_tmp)@

@   cmp         r11, r10
    vmull.u8    q6,d3,d23

    add         r10, r3, r2, lsl #3     @ 10*strd - 8+2
    vmlsl.u8    q6,d2,d22

    add         r10, r10, r2            @ 11*strd
    vmlsl.u8    q6,d4,d24

    vld1.u8     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25

    vmlal.u8    q6,d6,d26
    vst1.8      {d8},[r14],r6           @vst1_u8(pu1_dst,sto_res)@

    pld         [r10]                   @11+ 0
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2]               @11+ 1*strd
    vmlal.u8    q6,d16,d28

    pld         [r10, r2, lsl #1]       @11+ 2*strd
    vmlsl.u8    q6,d17,d29

    add         r10, r10, r2            @12*strd
    vqrshrun.s16 d10,q5,#6              @sto_res = vqmovun_s16(sto_res_tmp)@

    pld         [r10, r2, lsl #1]       @11+ 3*strd
    vmull.u8    q7,d4,d23

@   mov         r10, r11
    vmlsl.u8    q7,d3,d22

    subs        r7,r7,#4
    vmlsl.u8    q7,d5,d24

    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vqrshrun.s16 d12,q6,#6
    vst1.8      {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@



    bgt         kernel_8                @jumps to kernel_8

epilog:

    vmull.u8    q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22               @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24               @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27               @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29               @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

    vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22               @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24               @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27               @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29              @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14},[r14],r6

    vqrshrun.s16 d8,q4,#6               @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6              @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

epilog_end:
    vst1.8      {d12},[r14],r6
    vqrshrun.s16 d14,q7,#6

    vst1.8      {d14},[r14],r6


end_loops:
    tst         r5,#7
    ldr         r1, [sp], #4
    ldr         r0, [sp], #4

    ldmfdeq     sp!,{r4-r12,r15}        @reload the registers from sp
    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #8
    mov         r7, #16
@

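@/* remainder path: when wd is not a multiple of 8 the saved base pointers */
@/* are restored, stepped past the 8-column strips already filtered, and   */
@/* the last 4 columns fall through to core_loop_wd_4 below. that loop     */
@/* packs two adjacent rows into each d register (lanes 0 and 1 of a       */
@/* 32-bit pair), so every multiply-accumulate chain yields two output     */
@/* rows of 4 pixels; d0[0]/d0[1] and d8[0]/d8[1] are stored to four       */
@/* consecutive destination rows.                                          */
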
core_loop_wd_4:
    rsb         r9,r5,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb         r8,r5,r2,lsl #2         @r2->src_strd
    vmov.i8     d4,#0

outer_loop_wd_4:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4     @outer loop jump

inner_loop_wd_4:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2         @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2         @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]            @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23               @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2         @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22               @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2         @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24               @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25               @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2         @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26               @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2         @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27               @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2         @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28               @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2         @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29               @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
    vqrshrun.s16 d0,q0,#6               @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0[0]},[r1]            @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8    q4,d5,d29
    vst1.32     {d0[1]},[r3],r6         @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    vqrshrun.s16 d8,q4,#6

    vst1.32     {d8[0]},[r3],r6
    add         r1,r1,#4
    vst1.32     {d8[1]},[r3]
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4
    add         r1,r1,r9
    add         r0,r0,r8
    bgt         outer_loop_wd_4

    ldmfd       sp!, {r4-r12, r15}      @reload the registers from sp



@/**
@*******************************************************************************
@*
@* @brief
@*  interprediction luma filter for vertical 16bit output
@*
@* @par description:
@*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*  the elements pointed by 'pu1_src' and writes to the location pointed by
@*  'pi2_dst'. no downshifting or clipping is done and the output is used as
@*  an input for weighted prediction.
@*  assumptions: the function is optimized considering the fact that width is
@*  a multiple of 4 or 8, and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
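
@/* for reference, a scalar sketch of this 16-bit variant (not part of the */
@/* build; ref_luma_vert_w16out is an illustrative name, assuming the      */
@/* argument semantics documented above - the filtered sum is stored       */
@/* as-is, with no rounding, shift or clip):                               */
@/*                                                                        */
@/* void ref_luma_vert_w16out(unsigned char *pu1_src, short *pi2_dst,      */
@/*                           int src_strd, int dst_strd,                  */
@/*                           signed char *pi1_coeff, int ht, int wd)      */
@/* {                                                                      */
@/*     for(int row = 0; row < ht; row++)                                  */
@/*         for(int col = 0; col < wd; col++)                              */
@/*         {                                                              */
@/*             int sum = 0;                                               */
@/*             for(int i = 0; i < 8; i++)                                 */
@/*                 sum += pi1_coeff[i] *                                  */
@/*                        pu1_src[(row + i - 3) * src_strd + col];        */
@/*             pi2_dst[row * dst_strd + col] = (short)sum;                */
@/*         }                                                              */
@/* }                                                                      */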

@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
@                                       word16 *pi2_dst,
@                                       word32 src_strd,
@                                       word32 dst_strd,
@                                       word8 *pi1_coeff,
@                                       word32 ht,
@                                       word32 wd )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 => src_strd
@   r6 => dst_strd
@   r12 => *pi1_coeff
@   r5 => ht
@   r3 => wd



.globl ihevc_inter_pred_luma_vert_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments

    ldr         r12,[sp,#40]            @load pi1_coeff
    mov         r6,r3
    ldr         r5,[sp,#48]             @load wd
    vld1.u8     {d0},[r12]              @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2        @r12 = -3*src_strd
    vabs.s8     d0,d0                   @vabs_s8(coeff)
    add         r0,r0,r12               @pu1_src -= 3*src_strd (first filter tap row)
    ldr         r3,[sp,#44]             @load ht
    subs        r7,r3,#0                @r7 = ht
@   ble         end_loops_16out         @end loop jump
    vdup.u8     d22,d0[0]               @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]               @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]               @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]               @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]               @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]               @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]               @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]               @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt         core_loop_wd_4_16out    @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                @r5 ->wd
    rsb         r9,r4,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb         r8,r4,r2,lsl #2         @r2->src_strd
    mov         r6, r6, lsl #1          @dst_strd in bytes (16-bit output)
    mov         r3, r5, lsr #3          @divide by 8
    mul         r7, r3                  @multiply height by width
    sub         r7, #4                  @reserve one 4-row block iteration for the epilog
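
@/* the wd >= 8 path below mirrors the 8-bit function above; the only      */
@/* differences are that the 16-bit accumulators are stored to pi2_dst as  */
@/* full q registers (vst1.8 {dx,dy}), the vqrshrun narrowing steps are    */
@/* commented out, and dst_strd is doubled above so the pointer steps in   */
@/* bytes over 16-bit elements.                                            */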

prolog_16out:

    and         r10, r0, #31
    add         r3,r0,r2                @pu1_src_tmp += src_strd@

    vld1.u8     {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22               @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24               @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27               @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29               @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    addle       r0,r0,r8
    vmull.u8    q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    bicle       r4,r5,#7                @r5 ->wd
    vmlsl.u8    q5,d1,d22               @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vld1.u8     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d3,d24               @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27               @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    add         r3, r3, r2
    vmlal.u8    q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29              @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                @pu1_src_tmp += src_strd@
    vmull.u8    q6,d3,d23
    vld1.u8     {d1},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vld1.u8     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8, d9},[r1]!          @vst1_u8(pu1_dst,sto_res)@
@   vqrshrun.s16 d10,q5,#6              @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9,lsl #1

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10, d11},[r14],r6     @vst1_u8(pu1_dst_tmp,sto_res)@
@   vqrshrun.s16 d12,q6,#6


    blt         epilog_end_16out
    beq         epilog_16out            @jumps to epilog

kernel_8_16out:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22               @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    vld1.u8     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24               @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    bicle       r4,r5,#7                @r5 ->wd
    vmlal.u8    q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27               @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vst1.8      {d12,d13},[r14],r6
    vmlal.u8    q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    add         r3,r0,r2                @pu1_src_tmp += src_strd@
    vmlsl.u8    q4,d7,d29               @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


@   and         r11, r0, #31
    vmull.u8    q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vst1.8      {d14,d15},[r14],r6
    vmlsl.u8    q5,d1,d22               @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    add         r14,r1,r6
    vmlsl.u8    q5,d3,d24               @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vld1.u8     {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    vst1.8      {d8,d9},[r1]!           @vst1_u8(pu1_dst,sto_res)@
    vmlsl.u8    q5,d6,d27               @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    addle       r1,r1,r9,lsl #1
    vmlal.u8    q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

@   cmp         r11, r10
    vmlsl.u8    q5,d16,d29              @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r10, r3, r2, lsl #3     @ 10*strd - 8+2
    vmull.u8    q6,d3,d23

    add         r10, r10, r2            @ 11*strd
    vmlsl.u8    q6,d2,d22

    pld         [r10]                   @11+ 0
    vmlsl.u8    q6,d4,d24

    pld         [r10, r2]               @11+ 1*strd
    vmlal.u8    q6,d5,d25

    pld         [r10, r2, lsl #1]       @11+ 2*strd
    vmlal.u8    q6,d6,d26

    add         r10, r10, r2            @12*strd
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2, lsl #1]       @11+ 3*strd
    vmlal.u8    q6,d16,d28

@   mov         r10, r11
    vmlsl.u8    q6,d17,d29

    vld1.u8     {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23

    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22

    vst1.8      {d10, d11},[r14],r6     @vst1_u8(pu1_dst_tmp,sto_res)@
    vmlsl.u8    q7,d5,d24

    vld1.u8     {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d6,d25

    vld1.u8     {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26

    vld1.u8     {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27

    vld1.u8     {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28

    vld1.u8     {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29


    bgt         kernel_8_16out          @jumps to kernel_8

epilog_16out:

    vmull.u8    q4,d1,d23               @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22               @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24               @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25               @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26               @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27               @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28               @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29               @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12,d13},[r14],r6

@   vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23               @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22               @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24               @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25               @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26               @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27               @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28               @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29              @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14,d15},[r14],r6

@   vqrshrun.s16 d8,q4,#6               @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8,d9},[r1]!           @vst1_u8(pu1_dst,sto_res)@
@   vqrshrun.s16 d10,q5,#6              @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10,d11},[r14],r6      @vst1_u8(pu1_dst_tmp,sto_res)@
@   vqrshrun.s16 d12,q6,#6

epilog_end_16out:
    vst1.8      {d12,d13},[r14],r6
@   vqrshrun.s16 d14,q7,#6

    vst1.8      {d14,d15},[r14],r6


end_loops_16out:
    tst         r5,#7
    ldr         r1, [sp], #4
    ldr         r0, [sp], #4

    ldmfdeq     sp!,{r4-r12,r15}        @reload the registers from sp
    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #16
    mov         r7, #16
    mov         r6, r6, lsr #1

@

core_loop_wd_4_16out:
    rsb         r9,r5,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb         r8,r5,r2,lsl #2         @r2->src_strd
    vmov.i8     d4,#0
    mov         r6, r6, lsl #1

outer_loop_wd_4_16out:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4_16out @outer loop jump

inner_loop_wd_4_16out:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2         @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2         @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]            @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23               @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2         @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22               @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2         @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24               @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25               @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2         @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26               @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2         @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27               @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2         @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28               @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2         @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29               @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
@   vqrshrun.s16 d0,q0,#6               @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0},[r1]!              @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8    q4,d5,d29
    vst1.32     {d1},[r3],r6            @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
@   vqrshrun.s16 d8,q4,#6

    vst1.32     {d8},[r3],r6
@   add         r1,r1,#4
    vst1.32     {d9},[r3]
    bgt         inner_loop_wd_4_16out

end_inner_loop_wd_4_16out:
    subs        r7,r7,#4
    add         r1,r1,r9,lsl #1
    add         r0,r0,r8
    bgt         outer_loop_wd_4_16out

    ldmfd       sp!, {r4-r12, r15}      @reload the registers from sp