Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_inter_pred_chroma_horz_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for inter prediction  interpolation.
     25 @* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  yogeswaran rs / akshaya mukund
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
      44 @*       chroma inter prediction filter to store horizontal 16bit output
     45 @*
     46 @* @par description:
     47 @*    applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
     48 @*    to the elements pointed by 'pu1_src' and  writes to the location pointed
     49 @*    by 'pu1_dst'  no downshifting or clipping is done and the output is  used
     50 @*    as an input for vertical filtering or weighted  prediction
     51 @*
     52 @* @param[in] pu1_src
     53 @*  uword8 pointer to the source
     54 @*
     55 @* @param[out] pi2_dst
     56 @*  word16 pointer to the destination
     57 @*
     58 @* @param[in] src_strd
     59 @*  integer source stride
     60 @*
     61 @* @param[in] dst_strd
     62 @*  integer destination stride
     63 @*
     64 @* @param[in] pi1_coeff
     65 @*  word8 pointer to the filter coefficients
     66 @*
     67 @* @param[in] ht
     68 @*  integer height of the array
     69 @*
     70 @* @param[in] wd
     71 @*  integer width of the array
     72 @*
     73 @* @returns
     74 @*
     75 @* @remarks
     76 @*  none
     77 @*
     78 @*******************************************************************************
     79 @*/
     80 @void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
     81 @                                          word16 *pi2_dst,
     82 @                                          word32 src_strd,
     83 @                                          word32 dst_strd,
     84 @                                          word8 *pi1_coeff,
     85 @                                          word32 ht,
     86 @                                          word32 wd)
     87 @**************variables vs registers*****************************************
     88 @r0 => *pu1_src
     89 @r1 => *pi2_dst
     90 @r2 =>  src_strd
     91 @r3 =>  dst_strd
     92 
     93 
     94 .text
     95 .align 4
     96 
     97 
     98 
     99 
    100 .globl ihevc_inter_pred_chroma_horz_w16out_a9q
    101 
    102 
    103 .type ihevc_inter_pred_chroma_horz_w16out_a9q, %function
    104 
    105 ihevc_inter_pred_chroma_horz_w16out_a9q:
    106 
    107     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    108 
    109     ldr         r4,[sp,#40]                 @loads pi1_coeff
    110     ldr         r6,[sp,#44]                 @loads ht
    111     ldr         r10,[sp,#48]                @loads wd
    112 
    113     vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    114     subs        r14,r6,#0                   @checks for ht == 0
    115     vabs.s8     d2,d0                       @vabs_s8(coeff)
    116 
    117 @******* added
    118     mov         r11, #2
    119 @******* added ends
    120 
    121     ble         end_loops
    122 
    123     vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    124     sub         r12,r0,#2                   @pu1_src - 2
    125     vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    126     add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    127     vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    128 
    129     tst         r10,#3                      @checks wd for multiples of 4
    130     mov         r5,r10,lsl #1               @2wd
    131 
    132     vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    133 
    134     and         r7,r14,#1                   @added              @calculating ht_residue ht_residue = (ht & 1)
    135     sub         r14,r14,r7                  @added              @decrement height by ht_residue(residue value is calculated outside)
    136 
    137     bne         outer_loop_4                @ this branching happens when the width is 2 or 6
    138 
    139     cmp         r10,#12
    140     beq         skip_16
    141 
    142     cmp         r10,#8
    143     bge         outer_loop_16
    144 
    145 skip_16:
    146     tst         r6,#3
    147 
    148 @******* removal
    149     @mov       r11,#8
    150 @******* removal ends
    151 
    152     sub         r9,r0,#2
    153     beq         outer_loop_ht_4             @this branching happens when the height is a a multiple of 4
    154 
    155 
    156 
    157 @    cmp        r10,#12
    158 @    beq    outer_loop_8
    159 @    cmp        r10,#16
    160 @    bge    outer_loop_16
    161     b           outer_loop_8
    162 
    163 
    164 
    165 outer_loop_16:
    166     add         r4,r12,r2
    167 
    168 
    169     and         r0, r12, #31
    170     pld         [r12, r2, lsl #1]
    171 
    172 
    173 
    174 
    175 
    176 
    177 
    178     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    179     mov         r10,r5                      @2wd
    180     mul         r14,r14,r10
    181     vld1.u32    {q1},[r12],r11              @vector load pu1_src
    182     pld         [r4, r2, lsl #1]
    183     mov         r9,#10
    184     vld1.u32    {q2},[r12],r11              @vector load pu1_src
    185     rsb         r6,r3,#8
    186     sub         r8,r3,#8
    187     vld1.u32    {q3},[r12],r9               @vector load pu1_src
    188 
    189 
    190     vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    191     vld1.u32    {q4},[r4],r11               @vector load pu1_src
    192     vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    193     vld1.u32    {q5},[r4],r11               @vector load pu1_src
    194     vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    195     vld1.u32    {q6},[r4],r11               @vector load pu1_src
    196     vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    197     vld1.u32    {q7},[r4],r9                @vector load pu1_src
    198     vmull.u8    q14,d3,d25
    199     lsl         r6,#1
    200     rsb         r3,r5,r3,lsl #1
    201     vmlsl.u8    q14,d1,d24
    202     lsl         r8,#1
    203     rsb         r7,r5,r2,lsl #1
    204     vmlal.u8    q14,d5,d26
    205 
    206     vmlsl.u8    q14,d7,d27
    207     cmp         r14,#32
    208     beq         epilog_end
    209     sub         r14,#64
    210 
    211 inner_loop_16:
    212 
    213     @ and           r7, r12, #31                    @decrement the wd loop
    214     @ cmp           r7, r0
    215     pld         [r12, r2, lsl #2]
    216     pld         [r4, r2, lsl #2]
    217 
    218 
    219     subs        r10,r10,#16
    220 
    221     vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    222 
    223 
    224 
    225 @    addeq      r12,r12,r2,lsl #1
    226 @    subeq      r12,r12,r5
    227     addeq       r12,r12,r7
    228     addeq       r4,r12,r2
    229 
    230 
    231     vst1.16     {q15}, [r1]!
    232     vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    233 
    234 
    235 
    236 
    237 
    238     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    239     vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    240 
    241 
    242 
    243 
    244     vld1.u32    {q1},[r12],r11              @vector load pu1_src
    245     vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    246 
    247 
    248     vld1.u32    {q2},[r12],r11              @vector load pu1_src
    249     vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    250 
    251     vst1.16     {q14}, [r1],r8
    252     vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    253 
    254     vld1.u32    {q3},[r12],r9               @vector load pu1_src
    255     vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    256 
    257     vld1.u32    {q4},[r4],r11               @vector load pu1_src
    258     vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    259 
    260 
    261     vld1.u32    {q5},[r4],r11               @vector load pu1_src
    262     vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    263 
    264     vld1.u32    {q6},[r4],r11               @vector load pu1_src
    265     vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    266 
    267     vld1.u32    {q7},[r4],r9                @vector load pu1_src
    268     vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    269 
    270     vst1.16     {q11},[r1]!                 @store the result pu1_dst
    271     vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    272 
    273     moveq       r10,r5                      @2wd
    274     vmull.u8    q14,d3,d25
    275 
    276 
    277 
    278     vmlsl.u8    q14,d1,d24
    279     vst1.16     {q10},[r1],r6               @store the result pu1_dst
    280 
    281 
    282     addeq       r1,r1,r3,lsl #1
    283     vmlal.u8    q14,d5,d26
    284 
    285     subs        r14,r14,#32                 @decrement the ht loop
    286     vmlsl.u8    q14,d7,d27
    287 
    288 
    289 
    290 @    mov            r0, r7
    291     bgt         inner_loop_16
    292 
    293 
    294 
    295     add         r14,r14,#64
    296     cmp         r14,#32
    297     beq         epilog_end
    298 
    299 epilog:
    300 
    301     vst1.16     {q15}, [r1]!
    302     vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    303     vst1.16     {q14}, [r1],r8
    304 
    305 
    306 
    307     vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    308     subs        r10,r10,#16                 @decrement the wd loop
    309     vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    310 @    addeq      r12,r12,r2,lsl #1
    311     addeq       r12,r12,r7
    312     vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    313     @ subeq     r12,r12,r5
    314     moveq       r10,r5                      @2wd
    315     addeq       r4,r12,r2
    316     vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    317     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    318     vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    319     vld1.u32    {q1},[r12],r11              @vector load pu1_src
    320     vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    321     vld1.u32    {q2},[r12],r11              @vector load pu1_src
    322     vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    323     vld1.u32    {q3},[r12],r9               @vector load pu1_src
    324     vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    325 
    326 
    327     vld1.u32    {q4},[r4],r11               @vector load pu1_src
    328     vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    329     vld1.u32    {q5},[r4],r11               @vector load pu1_src
    330     vmlal.u8    q15,d4,d26                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    331 
    332     vmlsl.u8    q15,d6,d27                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    333 
    334     vld1.u32    {q6},[r4],r11               @vector load pu1_src
    335     vmull.u8    q14,d3,d25
    336     vld1.u32    {q7},[r4],r9                @vector load pu1_src
    337     vmlsl.u8    q14,d1,d24
    338     vst1.16     {q11},[r1]!                 @store the result pu1_dst
    339     vmlal.u8    q14,d5,d26
    340     vst1.16     {q10},[r1],r6               @store the result pu1_dst
    341     vmlsl.u8    q14,d7,d27
    342     addeq       r1,r1,r3,lsl #1
    343 
    344 
    345 epilog_end:
    346 
    347     vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    348     vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    349     vmlal.u8    q11,d12,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    350     vmlsl.u8    q11,d14,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    351 
    352 
    353     vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    354     vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    355     vmlal.u8    q10,d13,d26                 @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    356     vmlsl.u8    q10,d15,d27                 @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    357 
    358 
    359     vst1.16     {q15}, [r1]!
    360     vst1.16     {q14}, [r1],r8
    361     vst1.16     {q11},[r1]!                 @store the result pu1_dst
    362     vst1.16     {q10},[r1],r6               @store the result pu1_dst
    363 
    364 
    365     ldr         r6,[sp,#44]                 @loads ht
    366 
    367     and         r7,r6,#1
    368 
    369     cmp         r7,#0
    370     mov         r10,r5
    371     addne       r12,r12,r2,lsl #1
    372     subne       r12,r12,r5
    373     addne       r1,r1,r3,lsl #1
    374 
    375 
    376     bgt         loop_residue_4
    377 
    378     b           end_loops
    379 
    380 
    381 
    382 
    383 outer_loop_8:
    384 
    385     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    386     mov         r10,r5                      @2wd
    387     add         r4,r12,r2                   @pu1_src + src_strd
    388 
    389 inner_loop_8:
    390     @vld1.u32  {d0,d1},[r12],r11               @vector load pu1_src
    391     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    392     vld1.u32    {d1},[r12],r11              @vector load pu1_src
    393     vld1.u32    {d2},[r12],r11              @vector load pu1_src
    394     vld1.u32    {d3},[r12],r11              @vector load pu1_src
    395 
    396 
    397     @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    398     vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    399     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    400     @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    401     @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    402     vmlal.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    403     vmlsl.u8    q4,d3,d27                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    404 
    405     @vld1.u32  {d12,d13},[r4],r11              @vector load pu1_src + src_strd
    406     vld1.u32    {d4},[r4],r11               @vector load pu1_src
    407     vld1.u32    {d5},[r4],r11               @vector load pu1_src
    408     vld1.u32    {d6},[r4],r11               @vector load pu1_src
    409     vld1.u32    {d7},[r4],r11               @vector load pu1_src
    410     @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    411     vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    412     vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    413     @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    414     @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    415     vmlal.u8    q5,d6,d26                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    416     vmlsl.u8    q5,d7,d27                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    417 
    418     vst1.16     {d8, d9}, [r1]!
    419 
    420     subs        r10,r10,#8                  @decrement the wd loop
    421     vst1.16     {d10, d11},[r6]!            @store the result pu1_dst
    422     bgt         inner_loop_8
    423 
    424     sub         r12,r12,r5
    425     subs        r14,r14,#2                  @decrement the ht loop
    426     sub         r1,r1,r5,lsl #1
    427     add         r12,r12,r2,lsl #1
    428     add         r1,r1,r3,lsl #2
    429     bgt         outer_loop_8
    430 
    431     cmp         r7,#0
    432     mov         r10,r5
    433     bgt         loop_residue_4
    434 
    435     b           end_loops
    436 
    437 
    438 
    439 @height if 4 comes
    440 outer_loop_ht_4:
    441 
    442     mov         r10,r5
    443 
    444 prologue_ht_4:
    445     mov         r8,r3,lsl #1
    446 
    447 inner_loop_ht_4:
    448 
    449     mov         r12,r9
    450     mov         r4,r1
    451 
    452     sub         r0, r2, #6                  @ not sure if r0 needs to be preserved
    453 
    454     vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
    455     vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
    456     vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
    457     vld1.u32    {d3},[r12],r0               @(1)vector load pu1_src
    458 
    459     vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
    460     vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
    461     vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
    462     vld1.u32    {d7},[r12],r0               @(2)vector load pu1_src
    463 
    464     vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
    465     vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    466 
    467     vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
    468     vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    469 
    470     vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
    471     vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    472 
    473     vld1.u32    {d17},[r12],r0              @(3)vector load pu1_src
    474     vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    475 
    476     vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
    477     vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    478 
    479     vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
    480     vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    481 
    482     vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
    483     vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    484 
    485     vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
    486     vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    487 
    488     add         r9,r9,#8                    @(core loop)
    489 
    490     subs        r10,r10,#8                  @(prologue)decrement the wd loop
    491     beq         epilogue
    492 
    493 core_loop:
    494     vst1.16     {d8, d9},[r4],r8            @(1)store the result pu1_dst
    495     mov         r12,r9
    496 
    497     vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
    498     vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    499 
    500     vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
    501     vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    502 
    503     vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
    504     vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    505 
    506     vld1.u32    {d3},[r12],r0               @(1_1)vector load pu1_src
    507     vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    508 
    509     vst1.16     {d10, d11},[r4],r8          @(2)store the result pu1_dst
    510     add         r9,r9,#8                    @(core loop)
    511 
    512     vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
    513     vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    514 
    515     vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
    516     vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    517 
    518     vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
    519     vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    520 
    521     vld1.u32    {d7},[r12],r0               @(2_1)vector load pu1_src
    522     vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    523 
    524     vst1.16     {d12, d13},[r4],r8          @(3)store the result pu1_dst
    525     add         r1,r1,#16                   @(core loop)
    526 
    527     vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
    528     vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    529 
    530     vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
    531     vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    532 
    533     vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
    534     vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    535 
    536     vld1.u32    {d17},[r12],r0              @(3_1)vector load pu1_src
    537     vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    538 
    539     vst1.16     {d22, d23}, [r4], r8        @(4)store the result pu1_dst
    540     subs        r10,r10,#8                  @(core loop)
    541 
    542     vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    543     vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src
    544 
    545     vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
    546     vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    547 
    548     vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
    549     vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    550 
    551     mov         r4, r1                      @(core loop)
    552 
    553     vld1.u32    {d21},[r12],r0              @(4_1)vector load pu1_src
    554     vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    555 
    556 
    557 
    558     bgt         core_loop                   @loopback
    559 
    560 epilogue:
    561     vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    562 
    563     vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    564 
    565     vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    566 
    567     vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    568 
    569     vst1.16     {d8, d9},[r4], r8           @(1)store the result pu1_dst
    570 
    571     vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    572     vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    573 
    574     vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    575 
    576     vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    577 
    578     vst1.16     {d10, d11},[r4], r8         @(2)store the result pu1_dst
    579 
    580     vst1.16     {d12, d13},[r4], r8         @(3)store the result pu1_dst
    581 
    582     add         r1,r1,#16                   @(core loop)
    583 
    584     vst1.16     {d22, d23},[r4], r8         @(4)store the result pu1_dst
    585 
    586     sub         r9,r9,r5
    587     subs        r14,r14,#4                  @decrement the ht loop
    588     sub         r1,r1,r5,lsl #1
    589     add         r9,r9,r2,lsl #2
    590     add         r1,r1,r3,lsl #3
    591     bgt         outer_loop_ht_4
    592 
    593     cmp         r7,#0
    594     mov         r10,r5
    595     movgt       r12,r9
    596     movgt       r4,r1
    597     bgt         loop_residue_4
    598 
    599     b           end_loops
    600 
    601 outer_loop_4:
    602     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    603     mov         r10,r5
    604     add         r4,r12,r2                   @pu1_src + src_strd
    605 
    606 inner_loop_4:
    607     @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
    608     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    609     vld1.u32    {d1},[r12],r11              @vector load pu1_src
    610     vld1.u32    {d2},[r12],r11              @vector load pu1_src
    611     vld1.u32    {d3},[r12]                  @vector load pu1_src
    612 
    613 @**** removal
    614     @add       r12,r12,#4                      @increment the input pointer
    615 @**** removal ends
    616 @**** addn
    617     sub         r12,r12,#2                  @increment the input pointer
    618 @**** addn ends
    619     vld1.u32    {d4},[r4],r11               @vector load pu1_src
    620     vld1.u32    {d5},[r4],r11               @vector load pu1_src
    621     vld1.u32    {d6},[r4],r11               @vector load pu1_src
    622     vld1.u32    {d7},[r4]                   @vector load pu1_src
    623     @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    624     @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    625     @vld1.u32  {d12,d13},[r4]                  @vector load pu1_src + src_strd
    626     @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    627 
    628     @add       r4,r4,#4                        @increment the input pointer
    629     sub         r4,r4,#2
    630     @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    631     @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    632     @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    633 
    634 @**** removal
    635     @vzip.32   d0,d12                          @vector zip the i iteration and ii interation in single register
    636     @vzip.32   d2,d14
    637     @vzip.32   d4,d16
    638     @vzip.32   d6,d18
    639 @**** removal ends
    640 @**** addn
    641     vzip.32     d0,d4                       @vector zip the i iteration and ii interation in single register
    642     vzip.32     d1,d5
    643     vzip.32     d2,d6
    644     vzip.32     d3,d7
    645 @**** addn ends
    646 
    647     vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
    648     vmlsl.u8    q4,d0,d24
    649     vmlal.u8    q4,d2,d26
    650     vmlsl.u8    q4,d3,d27
    651 
    652     vst1.32     {d8},[r1]!                  @store the i iteration result which is in upper part of the register
    653     subs        r10,r10,#4                  @decrement the wd by 4
    654 
    655     vst1.32     {d9},[r6]!                  @store the ii iteration result which is in lower part of the register
    656 
    657     bgt         inner_loop_4
    658 
    659     sub         r12,r12,r5
    660     subs        r14,r14,#2                  @decrement the ht by 2
    661     sub         r1,r1,r5,lsl #1
    662     add         r12,r12,r2,lsl #1
    663     add         r1,r1,r3,lsl #2
    664     bgt         outer_loop_4
    665 
    666     cmp         r7,#0
    667     mov         r10,r5
    668     beq         end_loops
    669 
    670 loop_residue_4:
    671 
    672     mov         r10,r5                      @2wd
    673 
    674 loop_residue:
    675 
    676     @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
    677     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    678     vld1.u32    {d1},[r12],r11              @vector load pu1_src
    679     vld1.u32    {d2},[r12],r11              @vector load pu1_src
    680     vld1.u32    {d3},[r12]                  @vector load pu1_src
    681     @vext.u8       d2,d0,d1,#2             @vector extract of src[0_2]
    682     @vmull.u8      q4,d2,d25               @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    683     @vmlsl.u8      q4,d0,d24               @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    684     @vext.u8       d4,d0,d1,#4             @vector extract of src[0_4]
    685     @add           r12,r12,#4              @pu1_src + 4
    686     sub         r12, r12, #2
    687     @vext.u8       d6,d0,d1,#6             @vector extract of src[0_6]
    688     @vmlal.u8      q4,d4,d26               @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    689     @vmlsl.u8      q4,d6,d27               @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    690     vmull.u8    q4,d1,d25
    691     vmlsl.u8    q4,d0,d24
    692     vmlal.u8    q4,d2,d26
    693     vmlsl.u8    q4,d3,d27
    694 
    695     vst1.64     {d8 },[r1]                  @store the result pu1_dst
    696     subs        r10,r10,#4                  @decrement the wd loop
    697     add         r1,r1,#8                    @pi2_dst + 8
    698 
    699     bgt         loop_residue                @loop again
    700 
    701     @inner loop ends
    702     @add           r8,r3,lsl #1            @2*dst_strd
    703     @sub           r8,r8,r5,lsl #1         @2*dst_strd - 2wd
    704     @sub           r9,r2,r5                @src_strd - 2wd
    705     @subs          r7,r7,#1                @decrement the ht loop
    706     @add           r12,r12,r9              @pu1_src + src_strd
    707     @add           r1,r1,r8                @pu1_dst + 2*dst_strd
    708     @bgt           outer_loop_residue_4    @loop again
    709     @b                 end_loops               @jumps to end
    710 
    711 end_loops:
    712 
    713     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    714 
    715 
    716 
    717 
    718 
    719 
    720