@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma interprediction filter for horizontal input
@*
@* @par description:
@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*    to the elements pointed to by 'pu1_src' and writes to the location
@*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped
@*    to 8 bits.
@*    assumptions: the function is optimized assuming wd is a multiple of
@*    2, 4 or 8. if wd is 2, then ht must be a multiple of 2. widths of 4
@*    and 8 are optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
@                                   uword8 *pu1_dst,
@                                   word32 src_strd,
@                                   word32 dst_strd,
@                                   word8 *pi1_coeff,
@                                   word32 ht,
@                                   word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 =>  src_strd
@r3 =>  dst_strd
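@r4 => *pi1_coeff (loaded from the stack; reused later as a source pointer)
@r7 =>  ht        (loaded from the stack)
@r10 => wd        (loaded from the stack)
@
@a minimal c sketch of what this routine computes, for reference only (not
@part of the library source; CLIP_U8 is a hypothetical clip-to-[0,255]
@helper, and wd is assumed to count interleaved cb/cr pairs, so a row spans
@2 * wd bytes and the taps of one component sit 2 bytes apart):
@
@    for(row = 0; row < ht; row++)
@        for(col = 0; col < 2 * wd; col++)
@        {
@            word32 sum = 0, tap;
@            for(tap = 0; tap < 4; tap++)
@                sum += pi1_coeff[tap] *
@                       pu1_src[row * src_strd + col + (tap - 1) * 2];
@            /* rounding downshift by 6, clipped to 8 bits */
@            pu1_dst[row * dst_strd + col] = CLIP_U8((sum + 32) >> 6);
@        }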

.text
.align 4

.globl ihevc_inter_pred_chroma_horz_a9q

.type ihevc_inter_pred_chroma_horz_a9q, %function

ihevc_inter_pred_chroma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads pi1_coeff
    ldr         r7,[sp,#44]                 @loads ht
    ldr         r10,[sp,#48]                @loads wd

    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    subs        r14,r7,#0                   @checks for ht == 0
    vabs.s8     d2,d0                       @vabs_s8(coeff)
    mov         r11,#2                      @tap-to-tap step (interleaved cb/cr)
    ble         end_loops

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @pu1_src - 2
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
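@dispatch on wd and ht (a summary of the branches below):
@  wd not a multiple of 4                    -> outer_loop_4
@  wd a multiple of 4, wd >= 8, wd != 12     -> outer_loop_16
@  remaining wd (4 or 12), ht % 4 == 0       -> outer_loop_ht_4
@  otherwise                                 -> outer_loop_8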
    tst         r10,#3                      @checks whether wd is a multiple of 4
    mov         r5,r10,lsl #1               @2*wd

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    bne         outer_loop_4
    cmp         r10,#12
    beq         skip_16

    cmp         r10,#8
    bge         outer_loop_16
skip_16:
    tst         r7,#3                       @checks whether ht is a multiple of 4

    sub         r9,r0,#2
    beq         outer_loop_ht_4             @ht is a multiple of 4

    b           outer_loop_8

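@16-wide path: filters two rows per iteration (rows at r12 and r4), 16
@output bytes per row. loads for the next block are interleaved with the
@multiply-accumulates of the current one, and plds prefetch rows ahead.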
outer_loop_16:
    mov         r10,r5                      @2wd
    mul         r14,r14,r10                 @ht * 2wd = total output bytes

    rsb         r6,r3,#16                   @16 - dst_strd (step back to row 1, next 16-byte block)

    add         r4,r12,r2
    mov         r9,#10                      @step after the fourth load: 6 + 10 = 16 to the next block
    and         r0, r12, #31
    rsb         r8,r5,r3,lsl #1             @2*dst_strd - 2wd (row-pair rewind)
    pld         [r12, r2, lsl #1]

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    pld         [r4, r2, lsl #1]
    vld1.u32    {q1},[r12],r11              @vector load pu1_src

    vld1.u32    {q2},[r12],r11              @vector load pu1_src

    vld1.u32    {q3},[r12],r9               @vector load pu1_src

    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[1], coeffabs_1)
    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[2], coeffabs_2)
    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[3], coeffabs_3)
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmull.u8    q14,d3,d25

    vmlsl.u8    q14,d1,d24

    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27

    cmp         r14,#32
    beq         epilog_end
    sub         r14,#64

inner_loop_16:

@    bgt            l_2

@   pld         [r12, r2, lsl #1]
@   pld         [r4, r2, lsl #1]

    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]

    subs        r10,r10,#16

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[1], coeffabs_1)

    addeq       r12,r12,r8
    addeq       r4,r12,r2
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)

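@vqrshrun.s16 #6 implements the rounding downshift with unsigned
@saturation: out = clip_u8((sum + 32) >> 6)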
    vqrshrun.s16 d30,q15,#6

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vqrshrun.s16 d31,q14,#6

    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[2], coeffabs_2)

    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[3], coeffabs_3)

    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[1], coeffabs_1)

    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)

    vst1.16     {q15}, [r1],r3              @store the result pu1_dst (row 1)
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[2], coeffabs_2)

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[3], coeffabs_3)

    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[1], coeffabs_1)

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)

    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[2], coeffabs_2)

    cmp         r10,#0
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[3], coeffabs_3)

    moveq       r10,r5                      @2wd
    vmull.u8    q14,d3,d25

    vst1.16     {q11},[r1],r6               @store the result pu1_dst (row 2)
    vmlsl.u8    q14,d1,d24

    addeq       r1,r1,r8                    @move dst to the next row pair
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32                 @decrement the ht loop
    vmlsl.u8    q14,d7,d27

@     mov           r0, r7

    bgt         inner_loop_16

    add         r14,r14,#64
    cmp         r14,#32
    beq         epilog_end

epilog:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6

    vst1.16     {q15}, [r1],r3
    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[1], coeffabs_1)

    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[2], coeffabs_2)
    addeq       r12,r12,r8
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[3], coeffabs_3)
    moveq       r10,r5                      @2wd

    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[1], coeffabs_1)
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)
    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[2], coeffabs_2)
    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[3], coeffabs_3)
    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[1], coeffabs_1)

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[2], coeffabs_2)

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[3], coeffabs_3)

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmull.u8    q14,d3,d25
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlsl.u8    q14,d1,d24
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16     {q11},[r1],r6               @store the result pu1_dst
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r8

epilog_end:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[1], coeffabs_1)
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[2], coeffabs_2)
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[3], coeffabs_3)

    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[1], coeffabs_1)
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0], coeffabs_0)
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[2], coeffabs_2)
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[3], coeffabs_3)
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16     {q15}, [r1],r3

    vst1.16     {q11},[r1]                  @store the result pu1_dst

    b           end_loops

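@8-wide path: two rows per iteration, 8 output bytes per row. each tap
@is fetched with its own 8-byte load stepped 2 bytes apart, instead of
@one wide load plus vext (the commented-out variant below).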
outer_loop_8:

    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_8:
    @vld1.u32  {d0,d1},[r12],r11            @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12],r11              @vector load pu1_src

    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[1], coeffabs_1)
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0], coeffabs_0)
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]
    vmlal.u8    q4,d2,d26                   @mul_res = vmlal_u8(src[2], coeffabs_2)
    vmlsl.u8    q4,d3,d27                   @mul_res = vmlsl_u8(src[3], coeffabs_3)

    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4],r11               @vector load pu1_src
    @vld1.u32  {d12,d13},[r4],r11           @vector load pu1_src + src_strd
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[1], coeffabs_1)
    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0], coeffabs_0)
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]
    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
    vmlal.u8    q5,d6,d26                   @mul_res = vmlal_u8(src[2], coeffabs_2)
    vmlsl.u8    q5,d7,d27                   @mul_res = vmlsl_u8(src[3], coeffabs_3)

    vst1.8      {d8},[r1]!                  @store the result pu1_dst

    vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
    subs        r7,r7,#8                    @decrement the wd loop
    vst1.8      {d10},[r6]!                 @store the result pu1_dst
    bgt         inner_loop_8

    sub         r12,r12,r5                  @rewind src to the row start
    subs        r14,r14,#2                  @decrement the ht loop
    sub         r1,r1,r5                    @rewind dst to the row start
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_8
    b           end_loops

@taken when ht is a multiple of 4 (wd of 4 or 12)
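@four rows are produced per pass. the 8-column groups within a pass are
@software pipelined: inner_loop_ht_4 primes the first group's loads and
@MACs, core_loop overlaps the next group's loads with the current
@group's stores, and epilogue drains the final group.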
outer_loop_ht_4:

    mov         r7,r5

prologue_ht_4:

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r8, r2, #6                  @src_strd - 6 (step to the next row after three +2 loads)

    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
    @vld1.u32  {d3},[r12],r2                @(1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1)vector load pu1_src

    @sub       r12, r12, #6                 @(1)

    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
    @vld1.u32  {d7},[r12],r2                @(2)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2)vector load pu1_src

    @sub       r12, r12, #6                 @(2)

    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[1], coeffabs_1)

    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlal_u8(src[2], coeffabs_2)

    @vld1.u32  {d17},[r12],r2               @(3)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlsl_u8(src[3], coeffabs_3)

    @sub       r12, r12, #6                 @(3)
    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[1], coeffabs_1)

    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlal_u8(src[2], coeffabs_2)

    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlsl_u8(src[3], coeffabs_3)

    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
    vqrshrun.s16 d8,q4,#6                   @(1)right shift and saturating narrow result 1

    add         r9,r9,#8                    @(core loop)

    subs        r7,r7,#8                    @(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[1], coeffabs_1)

    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[2], coeffabs_2)

    @vld1.u32  {d3},[r12],r2                @(1_1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1_1)vector load pu1_src
    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[3], coeffabs_3)

    @sub       r12, r12, #6                 @(1_1)

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[1], coeffabs_1)

    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[2], coeffabs_2)

    @vld1.u32  {d7},[r12],r2                @(2_1)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2_1)vector load pu1_src
    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[3], coeffabs_3)

    @sub       r12, r12, #6                 @(2_1)

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[1], coeffabs_1)

    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlal_u8(src[2], coeffabs_2)

    @vld1.u32  {d17},[r12],r2               @(3_1)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3_1)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlsl_u8(src[3], coeffabs_3)

    @sub       r12, r12, #6                 @(3_1)

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst
    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2

    add         r9,r9,#8                    @(core loop)

    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[1], coeffabs_1)
    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src

    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlal_u8(src[2], coeffabs_2)

    vld1.u32    {d21},[r12],r2              @(4_1)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlsl_u8(src[3], coeffabs_3)

    add         r1,r1,#8                    @(core loop)

    subs        r7,r7,#8                    @(core loop)

    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst
    vqrshrun.s16 d8,q4,#6                   @(1_1)right shift and saturating narrow result 1

    mov         r4, r1                      @(core loop)

    bgt         core_loop                   @loopback

epilogue:
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[1], coeffabs_1)

    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[2], coeffabs_2)

    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[3], coeffabs_3)

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[1], coeffabs_1)
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0], coeffabs_0)

    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[2], coeffabs_2)

    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[3], coeffabs_3)

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst

    add         r1,r1,#8                    @(core loop)

    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2

    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst

    sub         r9,r9,r5                    @rewind src to the row start
    subs        r14,r14,#4                  @decrement the ht loop
    sub         r1,r1,r5                    @rewind dst to the row start
    add         r9,r9,r2,lsl #2             @advance src by four rows
    add         r1,r1,r3,lsl #2             @advance dst by four rows
    bgt         outer_loop_ht_4
    b           end_loops

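@4-wide path: two rows per iteration. vzip.32 interleaves the 4-pixel
@groups of both rows into one d register, so a single set of long
@multiply-accumulates filters both rows at once; d8[0]/d8[1] then hold
@the row-1/row-2 results.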
outer_loop_4:
    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_4:
    @vld1.u32  {d0,d1},[r12]                @vector load pu1_src

    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12]                  @vector load pu1_src

    sub         r12,r12,#2                  @rewind the input pointer
    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4]                   @vector load pu1_src
    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vld1.u32  {d12,d13},[r4]               @vector load pu1_src + src_strd
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]

    sub         r4,r4,#2                    @rewind the input pointer
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]

    vzip.32     d0,d4                       @zip the same tap of rows 1 and 2 into a single register
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7

    vmull.u8    q4,d1,d25                   @filter both rows at the same time
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vqrshrun.s16 d8,q4,#6                   @rounding right shift and saturating narrow
    vst1.32     {d8[0]},[r1]!               @store the row-1 result from the lower half of the register
    subs        r7,r7,#4                    @decrement the wd by 4

    vst1.32     {d8[1]},[r6]!               @store the row-2 result from the upper half of the register

    bgt         inner_loop_4

    sub         r12,r12,r5                  @rewind src to the row start
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5                    @rewind dst to the row start
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_4

end_loops:

    ldmfd       sp!,{r4-r12,r15}            @restore the registers from the stack and return