@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in neon assembly and can be compiled using rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_horz_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma inter prediction filter for horizontal input
@*
@* @par description:
@*    applies a horizontal 4-tap filter with coefficients pointed to by
@*    'pi1_coeff' to the elements pointed to by 'pu1_src' and writes the
@*    result to the location pointed to by 'pu1_dst'. the output is rounded,
@*    downshifted by 6 and clipped to 8 bits.
@*    assumptions: the function is optimized considering the fact that width
@*    is a multiple of 2, 4 or 8. if width is 2, then height should be a
@*    multiple of 2. widths of 4 and 8 are optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
@                                  uword8 *pu1_dst,
@                                  word32 src_strd,
@                                  word32 dst_strd,
@                                  word8 *pi1_coeff,
@                                  word32 ht,
@                                  word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 =>  src_strd
@r3 =>  dst_strd

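@/**
@* reference model (illustrative only): a minimal scalar c sketch of what the
@* neon code below computes, assuming the samples are interleaved cb/cr (the
@* '2wd' counters below suggest a row of 2*wd output bytes, with neighbouring
@* taps of one component 2 bytes apart). 'ref_chroma_horz' is a hypothetical
@* name, not part of the library. the neon code takes vabs of the
@* coefficients and folds the signs of the outer taps into vmlsl; with the
@* signed coefficients, the plain multiply-accumulate below is equivalent.
@*
@*     void ref_chroma_horz(uword8 *pu1_src, uword8 *pu1_dst,
@*                          word32 src_strd, word32 dst_strd,
@*                          word8 *pi1_coeff, word32 ht, word32 wd)
@*     {
@*         for(int row = 0; row < ht; row++)
@*             for(int col = 0; col < 2 * wd; col++)
@*             {
@*                 int sum = 0;
@*                 for(int tap = 0; tap < 4; tap++)       /* 4-tap filter  */
@*                     sum += pi1_coeff[tap] *
@*                            pu1_src[row * src_strd + col + 2 * (tap - 1)];
@*                 sum = (sum + 32) >> 6;                 /* round, >> 6   */
@*                 if(sum < 0)   sum = 0;                 /* clip to 8 bit */
@*                 if(sum > 255) sum = 255;
@*                 pu1_dst[row * dst_strd + col] = (uword8)sum;
@*             }
@*     }
@*/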
.text
.align 4

.globl ihevc_inter_pred_chroma_horz_a9q

.type ihevc_inter_pred_chroma_horz_a9q, %function

ihevc_inter_pred_chroma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads pi1_coeff
    ldr         r7,[sp,#44]                 @loads ht
    ldr         r10,[sp,#48]                @loads wd

    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    subs        r14,r7,#0                   @checks for ht == 0
    vabs.s8     d2,d0                       @vabs_s8(coeff)
    mov         r11,#2                      @2-byte step between taps of one chroma component
    ble         end_loops

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @pu1_src - 2
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                      @checks whether wd is a multiple of 4
    mov         r5,r10,lsl #1               @2wd

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    bne         outer_loop_4                @wd is only a multiple of 2
    cmp         r10,#12
    beq         skip_16                     @wd == 12 skips the 16-wide path

    cmp         r10,#8
    bge         outer_loop_16
skip_16:
    tst         r7,#3                       @checks whether ht is a multiple of 4

    sub         r9,r0,#2                    @pu1_src - 2
    beq         outer_loop_ht_4             @ht is a multiple of 4

    b           outer_loop_8
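
@ path selection summary (derived from the branches above):
@   wd % 4 != 0 (wd a multiple of 2 only) -> outer_loop_4    (4 bytes x 2 rows per iteration)
@   wd >= 8 and wd != 12                  -> outer_loop_16   (16 bytes x 2 rows per iteration)
@   wd == 4 or wd == 12, ht % 4 == 0      -> outer_loop_ht_4 (8 bytes x 4 rows per iteration)
@   wd == 4 or wd == 12, otherwise        -> outer_loop_8    (8 bytes x 2 rows per iteration)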

outer_loop_16:
    mov         r10,r5                      @2wd
    mul         r14,r14,r10                 @ht * 2wd = total output bytes

    rsb         r6,r3,#16                   @16 - dst_strd

    add         r4,r12,r2
    mov         r9,#10                      @last of four loads advances by 10: 3*2 + 10 = 16
    and         r0, r12, #31
    rsb         r8,r5,r3,lsl #1             @2*dst_strd - 2wd
    pld         [r12, r2, lsl #1]

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    pld         [r4, r2, lsl #1]
    vld1.u32    {q1},[r12],r11              @vector load pu1_src

    vld1.u32    {q2},[r12],r11              @vector load pu1_src

    vld1.u32    {q3},[r12],r9               @vector load pu1_src

    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmull.u8    q14,d3,d25

    vmlsl.u8    q14,d1,d24

    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27

    cmp         r14,#32                     @only one row pair of output left?
    beq         epilog_end
    sub         r14,#64                     @reserve the prologue and epilog iterations
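
@ inner_loop_16 below is software pipelined: the loads for the next row pair
@ are issued while the multiply-accumulates for the pair loaded above retire,
@ and each iteration produces 16 output bytes on each of two rows. r14 counts
@ ht * 2wd output bytes, so it steps by 32 per iteration; the 64 subtracted
@ above accounts for the prologue (the block above) and the epilog iterations.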

inner_loop_16:

@    bgt            l_2

@   pld         [r12, r2, lsl #1]
@   pld         [r4, r2, lsl #1]

    subs        r10,r10,#16

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)

    addeq       r12,r12,r8                  @end of row pair: advance src
    addeq       r4,r12,r2
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    pld         [r12, r2, lsl #2]
    vqrshrun.s16 d30,q15,#6                 @right shift and saturating narrow result 1

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vqrshrun.s16 d31,q14,#6                 @right shift and saturating narrow result 2

    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)

    pld         [r4, r2, lsl #2]
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vst1.16     {q15}, [r1],r3              @store the result pu1_dst
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    cmp         r10,#0
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    moveq       r10,r5                      @2wd
    vmull.u8    q14,d3,d25

    vst1.16     {q11},[r1],r6               @store the result pu1_dst
    vmlsl.u8    q14,d1,d24

    addeq       r1,r1,r8                    @end of row pair: advance dst
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32                 @decrement the ht loop
    vmlsl.u8    q14,d7,d27

@     mov           r0, r7

    bgt         inner_loop_16

    add         r14,r14,#64
    cmp         r14,#32
    beq         epilog_end

epilog:
    vqrshrun.s16 d30,q15,#6                 @right shift and saturating narrow result 1
    vqrshrun.s16 d31,q14,#6                 @right shift and saturating narrow result 2

    vst1.16     {q15}, [r1],r3              @store the result pu1_dst
    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    addeq       r12,r12,r8                  @end of row pair: advance src
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    moveq       r10,r5                      @2wd

    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmull.u8    q14,d3,d25
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlsl.u8    q14,d1,d24
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16     {q11},[r1],r6               @store the result pu1_dst
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r8                    @end of row pair: advance dst

epilog_end:
    vqrshrun.s16 d30,q15,#6                 @right shift and saturating narrow result 1
    vqrshrun.s16 d31,q14,#6                 @right shift and saturating narrow result 2

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16     {q15}, [r1],r3              @store the result pu1_dst

    vst1.16     {q11},[r1]                  @store the result pu1_dst

    b           end_loops
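@ outer_loop_8 handles two rows per iteration. the four 8-byte loads step the
@ source pointer by 2 bytes each, so d0..d3 (and d4..d7 for the second row)
@ hold the tap 0..3 inputs for eight interleaved chroma outputs, replacing
@ the vext-based extraction kept in the comments below.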
outer_loop_8:

    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_8:
    @vld1.u32  {d0,d1},[r12],r11            @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12],r11              @vector load pu1_src

    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]
    vmlal.u8    q4,d2,d26                   @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8    q4,d3,d27                   @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4],r11               @vector load pu1_src
    @vld1.u32  {d12,d13},[r4],r11           @vector load pu1_src + src_strd
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]
    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
    vmlal.u8    q5,d6,d26                   @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8    q5,d7,d27                   @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vst1.8      {d8},[r1]!                  @store the result pu1_dst

    vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
    subs        r7,r7,#8                    @decrement the wd loop
    vst1.8      {d10},[r6]!                 @store the result pu1_dst
    bgt         inner_loop_8

    sub         r12,r12,r5                  @rewind src to the row start
    subs        r14,r14,#2                  @decrement the ht loop
    sub         r1,r1,r5                    @rewind dst to the row start
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_8
    b           end_loops
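
@ outer_loop_ht_4 is taken when ht is a multiple of 4: an 8-byte column strip
@ is filtered four rows at a time with a prologue / core_loop / epilogue
@ software pipeline, so each store in core_loop retires a row computed one
@ stage earlier while the loads for the next strip are already in flight.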

@path taken when ht is a multiple of 4
outer_loop_ht_4:

    mov         r7,r5

prologue_ht_4:

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r8, r2, #6                  @src_strd - 6: step to the same column of the next row

    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
    @vld1.u32  {d3},[r12],r2                @(1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1)vector load pu1_src

    @sub       r12, r12, #6                 @(1)

    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
    @vld1.u32  {d7},[r12],r2                @(2)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2)vector load pu1_src

    @sub       r12, r12, #6                 @(2)

    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32  {d17},[r12],r2               @(3)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub       r12, r12, #6                 @(3)
    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
    vqrshrun.s16 d8,q4,#6                   @(1)right shift and saturating narrow result 1

    add         r9,r9,#8                    @(core loop)

    subs        r7,r7,#8                    @(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32  {d3},[r12],r2                @(1_1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1_1)vector load pu1_src
    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub       r12, r12, #6                 @(1_1)

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32  {d7},[r12],r2                @(2_1)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2_1)vector load pu1_src
    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub       r12, r12, #6                 @(2_1)

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32  {d17},[r12],r2               @(3_1)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3_1)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub       r12, r12, #6                 @(3_1)

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst
    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2

    add         r9,r9,#8                    @(core loop)

    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src

    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32    {d21},[r12],r2              @(4_1)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    add         r1,r1,#8                    @(core loop)

    subs        r7,r7,#8                    @(core loop)

    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst
    vqrshrun.s16 d8,q4,#6                   @(1_1)right shift and saturating narrow result 1

    mov         r4, r1                      @(core loop)

    bgt         core_loop                   @loopback

epilogue:
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst

    add         r1,r1,#8                    @(epilogue)

    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2

    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst

    sub         r9,r9,r5                    @rewind src to the row start
    subs        r14,r14,#4                  @decrement the ht loop
    sub         r1,r1,r5                    @rewind dst to the row start
    add         r9,r9,r2,lsl #2             @advance src by four rows
    add         r1,r1,r3,lsl #2             @advance dst by four rows
    bgt         outer_loop_ht_4
    b           end_loops
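
@ outer_loop_4 is taken when wd is only a multiple of 2. vzip.32 interleaves
@ 32-bit halves of two rows into a single d-register per tap, so one 4-tap
@ multiply-accumulate sequence filters both rows at once; after narrowing,
@ d8[0] holds the first row's result and d8[1] the second row's.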

outer_loop_4:
    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_4:
    @vld1.u32  {d0,d1},[r12]                @vector load pu1_src

    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12]                  @vector load pu1_src

    sub         r12,r12,#2                  @net advance of 4 bytes for the next 4 outputs
    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4]                   @vector load pu1_src
    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vld1.u32  {d12,d13},[r4]               @vector load pu1_src + src_strd
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]

    sub         r4,r4,#2                    @net advance of 4 bytes for the next 4 outputs
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]

    vzip.32     d0,d4                       @vector zip the i iteration and ii iteration into a single register
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7

    vmull.u8    q4,d1,d25                   @arithmetic operations for both iterations at the same time
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow of the result
    vst1.32     {d8[0]},[r1]!               @store the i iteration result, which is in the lower part of the register
    subs        r7,r7,#4                    @decrement the wd by 4

    vst1.32     {d8[1]},[r6]!               @store the ii iteration result, which is in the upper part of the register

    bgt         inner_loop_4

    sub         r12,r12,r5                  @rewind src to the row start
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5                    @rewind dst to the row start
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_4

end_loops:

    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (r15 loaded with the saved lr)
    685