@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in neon assembly and can be assembled using rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c */
@/* and modified to include reconstruction */


@/**
@*******************************************************************************
@*
@* @brief
@*  interprediction luma filter for vertical input
@*
@* @par description:
@*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*  the elements pointed to by 'pu1_src' and writes to the location pointed
@*  to by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
@*  assumptions: the function is optimized considering the fact that width
@*  is a multiple of 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 => src_strd
@   r6 => dst_strd
@   r12 => *pi1_coeff
@   r5 => ht
@   r3 => wd
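@/* a minimal c sketch of the filter implemented below (illustrative only,
@ * not part of the build; 'ref_luma_vert' and 'CLIP_U8' are hypothetical
@ * names, CLIP_U8 assumed to clamp to [0, 255]). the assembly applies the
@ * same 8-tap filter via vmlal/vmlsl on the absolute coefficient values,
@ * and the round, shift and clip correspond to the vqrshrun.s16 #6 steps:
@ *
@ * static void ref_luma_vert(uword8 *pu1_src, uword8 *pu1_dst,
@ *                           word32 src_strd, word32 dst_strd,
@ *                           word8 *pi1_coeff, word32 ht, word32 wd)
@ * {
@ *     word32 row, col, i, sum;
@ *     for(row = 0; row < ht; row++)
@ *     {
@ *         for(col = 0; col < wd; col++)
@ *         {
@ *             sum = 0;
@ *             for(i = 0; i < 8; i++)
@ *                 sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@ *             pu1_dst[col] = (uword8)CLIP_U8((sum + 32) >> 6);
@ *         }
@ *         pu1_src += src_strd;
@ *         pu1_dst += dst_strd;
@ *     }
@ * }
@ */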
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4
.syntax unified


.globl ihevc_inter_pred_luma_vert_a9q

.type ihevc_inter_pred_luma_vert_a9q, %function

ihevc_inter_pred_luma_vert_a9q:

    stmfd   sp!, {r4-r12, r14}      @stack stores the values of the arguments
    vpush   {d8 - d15}

    ldr     r12,[sp,#coeff_offset]  @load pi1_coeff
    mov     r6,r3
    ldr     r5,[sp,#wd_offset]      @load wd
    vld1.u8 {d0},[r12]              @coeff = vld1_s8(pi1_coeff)
    sub     r12,r2,r2,lsl #2        @r12 = -3 * src_strd
    vabs.s8 d0,d0                   @vabs_s8(coeff)
    add     r0,r0,r12               @pu1_src -= 3 * src_strd
    ldr     r3,[sp,#ht_offset]      @load ht
    subs    r7,r3,#0                @r3->ht
    @ble    end_loops               @end loop jump
    vdup.u8 d22,d0[0]               @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp     r5,#8
    vdup.u8 d23,d0[1]               @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8 d24,d0[2]               @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8 d25,d0[3]               @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8 d26,d0[4]               @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8 d27,d0[5]               @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8 d28,d0[6]               @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8 d29,d0[7]               @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt     core_loop_wd_4          @core loop wd 4 jump
    str     r0, [sp, #-4]!
    str     r1, [sp, #-4]!

    bic     r4,r5,#7                @r5 ->wd
    rsb     r9,r4,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb     r8,r4,r2,lsl #2         @r2->src_strd
    mov     r3, r5, lsr #3          @divide by 8
    mul     r7, r3                  @loop count = ht * (wd >> 3)
    sub     r7, #4                  @subtract one iteration (4 rows) for the epilog

prolog:

    and     r10, r0, #31
    add     r3,r0,r2                @pu1_src_tmp += src_strd@
    vld1.u8 {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8 {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs    r4,r4,#8
    vld1.u8 {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23           @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8 {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22           @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8 {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24           @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8 {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25           @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8 {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26           @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8 {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27           @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8 {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28           @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8 {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29           @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@

    vld1.u8 {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23           @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    addle   r0,r0,r8
    vmlsl.u8    q5,d1,d22           @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    bicle   r4,r5,#7                @r5 ->wd
    vmlsl.u8    q5,d3,d24           @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld     [r3]
    vmlal.u8    q5,d4,d25           @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld     [r3, r2]
    vmlal.u8    q5,d5,d26           @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld     [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27           @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add     r3, r3, r2
    vmlal.u8    q5,d7,d28           @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    pld     [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29          @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add     r3,r0,r2                @pu1_src_tmp += src_strd@
    vqrshrun.s16 d8,q4,#6           @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d1},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vld1.u8 {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8 {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add     r14,r1,r6
    vst1.8  {d8},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
    addle   r1,r1,r9

    vmull.u8    q7,d4,d23
    subs    r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8 {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8 {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8 {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8 {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8 {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8  {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

    blt     epilog_end              @jumps to epilog_end
    beq     epilog                  @jumps to epilog

kernel_8:

    subs    r4,r4,#8
    vmull.u8    q4,d1,d23           @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle   r0,r0,r8
    vmlsl.u8    q4,d0,d22           @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    bicle   r4,r5,#7                @r5 ->wd
    vmlsl.u8    q4,d2,d24           @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8 {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25           @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    vld1.u8 {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26           @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8 {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27           @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vmlal.u8    q4,d6,d28           @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    vmlsl.u8    q4,d7,d29           @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8  {d12},[r14],r6

    @ and   r11, r0, #31
    vqrshrun.s16 d14,q7,#6

    add     r3,r0,r2                @pu1_src_tmp += src_strd@
    vmull.u8    q5,d2,d23           @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vld1.u8 {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d1,d22           @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vmlsl.u8    q5,d3,d24           @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8 {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25           @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vst1.8  {d14},[r14],r6
    vmlal.u8    q5,d5,d26           @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    add     r14,r1,#0
    vmlsl.u8    q5,d6,d27           @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add     r1, r1, #8
    vmlal.u8    q5,d7,d28           @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    vmlsl.u8    q5,d16,d29          @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    addle   r1,r1,r9
    vqrshrun.s16 d8,q4,#6           @sto_res = vqmovun_s16(sto_res_tmp)@

    @ cmp   r11, r10
    vmull.u8    q6,d3,d23

    add     r10, r3, r2, lsl #3     @ 10*strd - 8+2
    vmlsl.u8    q6,d2,d22

    add     r10, r10, r2            @ 11*strd
    vmlsl.u8    q6,d4,d24

    vld1.u8 {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25

    vmlal.u8    q6,d6,d26
    vst1.8  {d8},[r14],r6           @vst1_u8(pu1_dst,sto_res)@

    pld     [r10]                   @11+ 0
    vmlsl.u8    q6,d7,d27

    pld     [r10, r2]               @11+ 1*strd
    vmlal.u8    q6,d16,d28

    pld     [r10, r2, lsl #1]       @11+ 2*strd
    vmlsl.u8    q6,d17,d29

    add     r10, r10, r2            @12*strd
    vqrshrun.s16 d10,q5,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    pld     [r10, r2, lsl #1]       @11+ 3*strd
    vmull.u8    q7,d4,d23

    @ mov   r10, r11
    vmlsl.u8    q7,d3,d22

    subs    r7,r7,#4
    vmlsl.u8    q7,d5,d24

    vmlal.u8    q7,d6,d25
    vld1.u8 {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8 {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8 {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8 {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8 {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vqrshrun.s16 d12,q6,#6
    vst1.8  {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@

    bgt     kernel_8                @jumps to kernel_8

epilog:

    vmull.u8    q4,d1,d23           @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22           @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24           @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25           @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26           @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27           @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28           @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29           @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8  {d12},[r14],r6

    vqrshrun.s16 d14,q7,#6

    vld1.u8 {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23           @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22           @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24           @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25           @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26           @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27           @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28           @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29          @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8  {d14},[r14],r6

    vqrshrun.s16 d8,q4,#6           @sto_res = vqmovun_s16(sto_res_tmp)@
    vld1.u8 {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add     r14,r1,r6
    vst1.8  {d8},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8  {d10},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

epilog_end:
    vst1.8  {d12},[r14],r6
    vqrshrun.s16 d14,q7,#6

    vst1.8  {d14},[r14],r6


end_loops:
    tst     r5,#7
    ldr     r1, [sp], #4
    ldr     r0, [sp], #4

    beq     end1

    mov     r5, #4
    add     r0, r0, #8
    add     r1, r1, #8
    mov     r7, #16
    @

core_loop_wd_4:
    rsb     r9,r5,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb     r8,r5,r2,lsl #2         @r2->src_strd
    vmov.i8 d4,#0

outer_loop_wd_4:
    subs    r12,r5,#0
    ble     end_inner_loop_wd_4     @outer loop jump

inner_loop_wd_4:
    add     r3,r0,r2
    vld1.u32 {d4[1]},[r3],r2        @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs    r12,r12,#4
    vdup.u32 d5,d4[1]               @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32 {d5[1]},[r3],r2        @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32 {d4[0]},[r0]           @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8 q0,d5,d23              @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32 d6,d5[1]               @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add     r0,r0,#4
    vld1.u32 {d6[1]},[r3],r2        @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8 q0,d4,d22              @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32 d7,d6[1]               @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32 {d7[1]},[r3],r2        @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q0,d6,d24              @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8 q4,d7,d23
    vdup.u32 d4,d7[1]               @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8 q1,d7,d25              @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32 {d4[1]},[r3],r2        @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8 q4,d6,d22
    vmlal.u8 q0,d4,d26              @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32 d5,d4[1]               @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8 q4,d4,d24
    vld1.u32 {d5[1]},[r3],r2        @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8 q1,d5,d27              @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32 d6,d5[1]               @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8 q4,d5,d25
    vld1.u32 {d6[1]},[r3],r2        @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8 q0,d6,d28              @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32 d7,d6[1]               @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8 q4,d6,d26
    vld1.u32 {d7[1]},[r3],r2        @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q1,d7,d29              @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32 d4,d7[1]
    vadd.i16 q0,q0,q1               @mul_res1 = vaddq_u16(mul_res1, mul_res2)@
    vmlsl.u8 q4,d7,d27
    vld1.u32 {d4[1]},[r3],r2
    vmlal.u8 q4,d4,d28
    vdup.u32 d5,d4[1]
    vqrshrun.s16 d0,q0,#6           @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32 {d5[1]},[r3]
    add     r3,r1,r6
    vst1.32 {d0[0]},[r1]            @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8 q4,d5,d29
    vst1.32 {d0[1]},[r3],r6         @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    vqrshrun.s16 d8,q4,#6

    vst1.32 {d8[0]},[r3],r6
    add     r1,r1,#4
    vst1.32 {d8[1]},[r3]
    bgt     inner_loop_wd_4

end_inner_loop_wd_4:
    subs    r7,r7,#4
    add     r1,r1,r9
    add     r0,r0,r8
    bgt     outer_loop_wd_4

end1:
    vpop    {d8 - d15}
    ldmfd   sp!, {r4-r12, r15}      @reload the registers from sp


@/**
@*******************************************************************************
@*
@* @brief
@*  interprediction luma filter for vertical 16bit output
@*
@* @par description:
@*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*  the elements pointed to by 'pu1_src' and writes to the location pointed
@*  to by 'pi2_dst'. no downshifting or clipping is done and the output is
@*  used as an input for weighted prediction.
@*  assumptions: the function is optimized considering the fact that width
@*  is a multiple of 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
@                                       word16 *pi2_dst,
@                                       word32 src_strd,
@                                       word32 dst_strd,
@                                       word8 *pi1_coeff,
@                                       word32 ht,
@                                       word32 wd )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 => src_strd
@   r6 => dst_strd
@   r12 => *pi1_coeff
@   r5 => ht
@   r3 => wd
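@/* a minimal c sketch of the w16out variant (illustrative only, not part
@ * of the build; 'ref_luma_vert_w16out' is a hypothetical name). the
@ * filtering is identical, but the 16-bit accumulator is stored as-is for
@ * a later weighted-prediction / rounding stage, which is why the
@ * vqrshrun.s16 #6 instructions below are commented out. dst_strd counts
@ * 16-bit elements; the asm doubles it to a byte stride with
@ * 'mov r6, r6, lsl #1'.
@ *
@ * static void ref_luma_vert_w16out(uword8 *pu1_src, word16 *pi2_dst,
@ *                                  word32 src_strd, word32 dst_strd,
@ *                                  word8 *pi1_coeff, word32 ht, word32 wd)
@ * {
@ *     word32 row, col, i, sum;
@ *     for(row = 0; row < ht; row++)
@ *     {
@ *         for(col = 0; col < wd; col++)
@ *         {
@ *             sum = 0;
@ *             for(i = 0; i < 8; i++)
@ *                 sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@ *             pi2_dst[col] = (word16)sum;
@ *         }
@ *         pu1_src += src_strd;
@ *         pi2_dst += dst_strd;
@ *     }
@ * }
@ */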
.globl ihevc_inter_pred_luma_vert_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16out_a9q:

    stmfd   sp!, {r4-r12, r14}      @stack stores the values of the arguments
    vpush   {d8 - d15}

    ldr     r12,[sp,#coeff_offset]  @load pi1_coeff
    mov     r6,r3
    ldr     r5,[sp,#wd_offset]      @load wd
    vld1.u8 {d0},[r12]              @coeff = vld1_s8(pi1_coeff)
    sub     r12,r2,r2,lsl #2        @r12 = -3 * src_strd
    vabs.s8 d0,d0                   @vabs_s8(coeff)
    add     r0,r0,r12               @pu1_src -= 3 * src_strd
    ldr     r3,[sp,#ht_offset]      @load ht
    subs    r7,r3,#0                @r3->ht
    @ble    end_loops_16out         @end loop jump
    vdup.u8 d22,d0[0]               @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp     r5,#8
    vdup.u8 d23,d0[1]               @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8 d24,d0[2]               @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8 d25,d0[3]               @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8 d26,d0[4]               @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8 d27,d0[5]               @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8 d28,d0[6]               @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8 d29,d0[7]               @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt     core_loop_wd_4_16out    @core loop wd 4 jump
    str     r0, [sp, #-4]!
    str     r1, [sp, #-4]!

    bic     r4,r5,#7                @r5 ->wd
    rsb     r9,r4,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb     r8,r4,r2,lsl #2         @r2->src_strd
    mov     r6, r6, lsl #1          @dst stride in bytes (16bit output)
    mov     r3, r5, lsr #3          @divide by 8
    mul     r7, r3                  @loop count = ht * (wd >> 3)
    sub     r7, #4                  @subtract one iteration (4 rows) for the epilog

prolog_16out:

    and     r10, r0, #31
    add     r3,r0,r2                @pu1_src_tmp += src_strd@

    vld1.u8 {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8 {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs    r4,r4,#8
    vld1.u8 {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23           @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8 {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22           @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8 {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24           @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8 {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25           @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8 {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26           @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8 {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27           @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8 {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28           @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8 {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29           @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@

    addle   r0,r0,r8
    vmull.u8    q5,d2,d23           @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    bicle   r4,r5,#7                @r5 ->wd
    vmlsl.u8    q5,d1,d22           @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vld1.u8 {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d3,d24           @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld     [r3]
    vmlal.u8    q5,d4,d25           @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld     [r3, r2]
    vmlal.u8    q5,d5,d26           @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld     [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27           @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    add     r3, r3, r2
    vmlal.u8    q5,d7,d28           @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    pld     [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29          @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add     r3,r0,r2                @pu1_src_tmp += src_strd@
    vmull.u8    q6,d3,d23
    vld1.u8 {d1},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8 {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vld1.u8 {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add     r14,r1,r6
    vst1.8  {d8, d9},[r1]!          @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    addle   r1,r1,r9,lsl #1

    vmull.u8    q7,d4,d23
    subs    r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8 {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8 {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8 {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8 {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8 {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8  {d10, d11},[r14],r6     @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6

    blt     epilog_end_16out
    beq     epilog_16out            @jumps to epilog_16out

kernel_8_16out:

    subs    r4,r4,#8
    vmull.u8    q4,d1,d23           @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle   r0,r0,r8
    vmlsl.u8    q4,d0,d22           @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    vld1.u8 {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24           @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8 {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25           @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    bicle   r4,r5,#7                @r5 ->wd
    vmlal.u8    q4,d4,d26           @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8 {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27           @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vst1.8  {d12,d13},[r14],r6
    vmlal.u8    q4,d6,d28           @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    add     r3,r0,r2                @pu1_src_tmp += src_strd@
    vmlsl.u8    q4,d7,d29           @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@

    @ and   r11, r0, #31
    vmull.u8    q5,d2,d23           @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vst1.8  {d14,d15},[r14],r6
    vmlsl.u8    q5,d1,d22           @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    add     r14,r1,r6
    vmlsl.u8    q5,d3,d24           @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8 {d0},[r0]!              @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25           @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vld1.u8 {d1},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d5,d26           @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    vst1.8  {d8,d9},[r1]!           @vst1_u8(pu1_dst,sto_res)@
    vmlsl.u8    q5,d6,d27           @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    addle   r1,r1,r9,lsl #1
    vmlal.u8    q5,d7,d28           @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    @ cmp   r11, r10
    vmlsl.u8    q5,d16,d29          @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add     r10, r3, r2, lsl #3     @ 10*strd - 8+2
    vmull.u8    q6,d3,d23

    add     r10, r10, r2            @ 11*strd
    vmlsl.u8    q6,d2,d22

    pld     [r10]                   @11+ 0
    vmlsl.u8    q6,d4,d24

    pld     [r10, r2]               @11+ 1*strd
    vmlal.u8    q6,d5,d25

    pld     [r10, r2, lsl #1]       @11+ 2*strd
    vmlal.u8    q6,d6,d26

    add     r10, r10, r2            @12*strd
    vmlsl.u8    q6,d7,d27

    pld     [r10, r2, lsl #1]       @11+ 3*strd
    vmlal.u8    q6,d16,d28

    @ mov   r10, r11
    vmlsl.u8    q6,d17,d29

    vld1.u8 {d2},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23

    subs    r7,r7,#4
    vmlsl.u8    q7,d3,d22

    vst1.8  {d10, d11},[r14],r6     @vst1_u8(pu1_dst_tmp,sto_res)@
    vmlsl.u8    q7,d5,d24

    vld1.u8 {d3},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d6,d25

    vld1.u8 {d4},[r3],r2            @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26

    vld1.u8 {d5},[r3],r2            @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27

    vld1.u8 {d6},[r3],r2            @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28

    vld1.u8 {d7},[r3],r2            @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29

    bgt     kernel_8_16out          @jumps to kernel_8_16out

epilog_16out:

    vmull.u8    q4,d1,d23           @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22           @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24           @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25           @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26           @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27           @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28           @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29           @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8  {d12,d13},[r14],r6

    @vqrshrun.s16 d14,q7,#6

    vld1.u8 {d16},[r3],r2           @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23           @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22           @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24           @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25           @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26           @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27           @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28           @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29          @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8  {d14,d15},[r14],r6

    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d17},[r3],r2           @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add     r14,r1,r6
    vst1.8  {d8,d9},[r1]!           @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d18},[r3],r2           @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8  {d10,d11},[r14],r6      @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6

epilog_end_16out:
    vst1.8  {d12,d13},[r14],r6
    @vqrshrun.s16 d14,q7,#6

    vst1.8  {d14,d15},[r14],r6


end_loops_16out:
    tst     r5,#7
    ldr     r1, [sp], #4
    ldr     r0, [sp], #4

    beq     end2

    mov     r5, #4
    add     r0, r0, #8
    add     r1, r1, #16
    mov     r7, #16
    mov     r6, r6, lsr #1

    @

core_loop_wd_4_16out:
    rsb     r9,r5,r6,lsl #2         @r6->dst_strd r5 ->wd
    rsb     r8,r5,r2,lsl #2         @r2->src_strd
    vmov.i8 d4,#0
    mov     r6, r6, lsl #1

outer_loop_wd_4_16out:
    subs    r12,r5,#0
    ble     end_inner_loop_wd_4_16out @outer loop jump

inner_loop_wd_4_16out:
    add     r3,r0,r2
    vld1.u32 {d4[1]},[r3],r2        @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs    r12,r12,#4
    vdup.u32 d5,d4[1]               @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32 {d5[1]},[r3],r2        @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32 {d4[0]},[r0]           @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8 q0,d5,d23              @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32 d6,d5[1]               @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add     r0,r0,#4
    vld1.u32 {d6[1]},[r3],r2        @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8 q0,d4,d22              @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32 d7,d6[1]               @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32 {d7[1]},[r3],r2        @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q0,d6,d24              @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8 q4,d7,d23
    vdup.u32 d4,d7[1]               @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8 q1,d7,d25              @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32 {d4[1]},[r3],r2        @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8 q4,d6,d22
    vmlal.u8 q0,d4,d26              @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32 d5,d4[1]               @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8 q4,d4,d24
    vld1.u32 {d5[1]},[r3],r2        @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8 q1,d5,d27              @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32 d6,d5[1]               @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8 q4,d5,d25
    vld1.u32 {d6[1]},[r3],r2        @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8 q0,d6,d28              @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32 d7,d6[1]               @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8 q4,d6,d26
    vld1.u32 {d7[1]},[r3],r2        @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q1,d7,d29              @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32 d4,d7[1]
    vadd.i16 q0,q0,q1               @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8 q4,d7,d27
    vld1.u32 {d4[1]},[r3],r2
    vmlal.u8 q4,d4,d28
    vdup.u32 d5,d4[1]
    @vqrshrun.s16 d0,q0,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
    vld1.u32 {d5[1]},[r3]
    add     r3,r1,r6
    vst1.32 {d0},[r1]!              @store 4 x 16bit results of row 0

    vmlsl.u8 q4,d5,d29
    vst1.32 {d1},[r3],r6            @store 4 x 16bit results of row 1
    @vqrshrun.s16 d8,q4,#6

    vst1.32 {d8},[r3],r6
    @add    r1,r1,#4
    vst1.32 {d9},[r3]
    bgt     inner_loop_wd_4_16out

end_inner_loop_wd_4_16out:
    subs    r7,r7,#4
    add     r1,r1,r9,lsl #1
    add     r0,r0,r8
    bgt     outer_loop_wd_4_16out
end2:
    vpop    {d8 - d15}
    ldmfd   sp!, {r4-r12, r15}      @reload the registers from sp