Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_weighted_pred_bi_default.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for weighted prediction used in inter
     25 @* prediction
     26 @*
     27 @* @author
     28 @*  parthiban v
     29 @*
     30 @* @par list of functions:
     31 @*  - ihevc_weighted_pred_bi_default()
     32 @*
     33 @* @remarks
     34 @*  none
     35 @*
     36 @*******************************************************************************
     37 @*/
     38 @/**
     39 @*******************************************************************************
     40 @*
     41 @* @brief
      42 @*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
      43 @* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
      44 @* assumptions : the function is optimized considering the fact that width
      45 @* and height are multiples of 2.
     46 @*
     47 @* @par description:
      48 @*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
      49 @* >> shift,  where shift = 15 - bitdepth
     50 @*
     51 @* @param[in] pi2_src1
     52 @*  pointer to source 1
     53 @*
     54 @* @param[in] pi2_src2
     55 @*  pointer to source 2
     56 @*
     57 @* @param[out] pu1_dst
     58 @*  pointer to destination
     59 @*
     60 @* @param[in] src_strd1
     61 @*  source stride 1
     62 @*
     63 @* @param[in] src_strd2
     64 @*  source stride 2
     65 @*
     66 @* @param[in] dst_strd
     67 @*  destination stride
     68 @*
     69 @* @param[in] lvl_shift1
     70 @*  added before shift and offset
     71 @*
     72 @* @param[in] lvl_shift2
     73 @*  added before shift and offset
     74 @*
     75 @* @param[in] ht
     76 @*  height of the source
     77 @*
     78 @* @param[in] wd
     79 @*  width of the source
     80 @*
     81 @* @returns
     82 @*
     83 @* @remarks
     84 @*  none
     85 @*
     86 @*******************************************************************************
     87 @*/
     88 @void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
     89 @                                    word16 *pi2_src2,
     90 @                                    uword8 *pu1_dst,
     91 @                                    word32 src_strd1,
     92 @                                    word32 src_strd2,
     93 @                                    word32 dst_strd,
     94 @                                    word32 lvl_shift1,
     95 @                                    word32 lvl_shift2,
     96 @                                    word32 ht,
     97 @                                    word32 wd)
     98 
     99 @**************variables vs registers*****************************************
    100 @   r0 => *pi2_src1
    101 @   r1 => *pi2_src2
    102 @   r2 => *pu1_dst
    103 @   r3 =>  src_strd1
    104 @   r4 =>  src_strd2
    105 @   r5 =>  dst_strd
    106 @   r6 =>  lvl_shift1
    107 @   r7 =>  lvl_shift2
    108 @   r8 =>  ht
    109 @   r9 =>  wd
    110 
     111 .equ    src_strd2_offset,       104     @ first stack argument: 40 bytes (r4-r12,r14) + 64 bytes (d8-d15) are pushed below it
     112 .equ    dst_strd_offset,        108     @ each following stack argument is 4 bytes further up
     113 .equ    lvl_shift1_offset,      112
     114 .equ    lvl_shift2_offset,      116
     115 .equ    ht_offset,              120
     116 .equ    wd_offset,              124
     117 
     118 .text
     119 .syntax unified
     120 .align 4
     121 @ ihevc_weighted_pred_bi_default_a9q: combines two 16-bit prediction blocks into 8-bit pixels.
     122 @ per pixel: dst = sat_u8((src1 + src2 + lvl_shift1 + lvl_shift2 + 0x40) >> 7); the per-pixel sums use saturating signed 16-bit adds.
     123 @ in: r0 = pi2_src1, r1 = pi2_src2, r2 = pu1_dst, r3 = src_strd1; the other six arguments are read from the stack.
     124 @ no return value is produced; r4-r12 and d8-d15 are saved here and restored at end_loops.
     125 .globl ihevc_weighted_pred_bi_default_a9q
     126 
     127 .type ihevc_weighted_pred_bi_default_a9q, %function
     128 
     129 ihevc_weighted_pred_bi_default_a9q:
     130 
     131     stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (10 registers, 40 bytes)
     132     vpush       {d8  -  d15}                @save d8-d15 (64 bytes)
     133     ldr         r4,[sp,#src_strd2_offset]   @load src_strd2
     134     lsl         r3,r3,#1                    @src_strd1 in bytes (sources are 16-bit samples)
     135     ldr         r5,[sp,#dst_strd_offset]    @load dst_strd
     136     ldr         r6,[sp,#lvl_shift1_offset]  @load lvl_shift1
     137     lsl         r4,r4,#1                    @src_strd2 in bytes (sources are 16-bit samples)
     138     ldr         r7,[sp,#lvl_shift2_offset]  @load lvl_shift2
     139     ldr         r8,[sp,#ht_offset]          @load ht
     140     ldr         r9,[sp,#wd_offset]          @load wd
     141     vdup.16     q2,r6                       @every 16-bit lane of q2 = (int16_t)lvl_shift1
     142     vdup.16     q3,r7                       @every 16-bit lane of q3 = (int16_t)lvl_shift2
     143     vmov.i16    q0,#0x40                    @rounding term 1 << (shift - 1) for shift = 7
     144     vadd.i16    q2,q3                       @q2 = lvl_shift1 + lvl_shift2
     145     vadd.s16    q0,q0,q2                    @q0 = rounding term + both level shifts; added to every src1 + src2 sum
     146 @   vmvn.i32    q1,#0x6                         @vmovq_n_s32(tmp_shift)
     147     lsl         r6,r9,#1                    @r6 = 2*wd = bytes of 16-bit source consumed per row
     148     rsb         r7,r6,r3,lsl #2             @r7 = 4*src_strd1(bytes) - 2*wd : src1 step to the next group of 4 rows
     149     rsb         r10,r6,r4,lsl #2            @r10 = 4*src_strd2(bytes) - 2*wd : src2 step to the next group of 4 rows
     150     @asr            r6,#1
     151     @rsb            r6,r6,r5,lsl #2             @4*dst_strd - wd
     152 
     153     cmp         r8,#0                       @check ht == 0
     154     beq         end_loops                   @if equal, then end the function
     155 
     156 chroma_decision:
     157     orr         r14,r8,r9                   @r14 = ht | wd
     158     cmp         r14,#10                     @(ht | wd) == 10, e.g. wd = 8 with ht = 2
     159     beq         outer_loop_chroma_8x2       @-> 8 columns x 2 rows per iteration
     160 
     161     cmp         r14,#6                      @(ht | wd) == 6, e.g. wd = 4 with ht = 2
     162     beq         outer_loop_chroma_4x2       @-> 4 columns x 2 rows per iteration
     163 
     164 
     165 luma_decision:
     166     cmp         r9,#24                      @wd == 24 is handled as three 8-wide steps
     167     beq         outer_loop_8
     168 
     169     cmp         r9,#16                      @any other wd >= 16 uses the 16-wide pipelined loop
     170     bge         outer_loop_16
     171 
     172     cmp         r9,#12                      @wd == 12 is handled as three 4-wide steps
     173     beq         outer_loop_4
     174 
     175     cmp         r9,#8                       @remaining wd >= 8 uses the 8-wide loop;
     176     bge         outer_loop_8                @anything smaller falls through into outer_loop_4
    177 
    178 
    179 
    180 
    181 
    182 
     183 outer_loop_4:
     184     cmp         r9,#0                       @check wd == 0
     185     beq         end_loops                   @if equal, then end the function
     186 @ each pass of core_loop_4 produces a 4-column x 4-row tile; r9 = columns left in this row group, r8 = rows left
     187 core_loop_4:
     188     add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + src_strd1 (r3 is already a byte stride) -> row 1 of src1
     189     add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + src_strd2 (r4 is already a byte stride) -> row 1 of src2
     190     vld1.s16    {d6},[r0]!                  @row 0: load 4 samples of src1 and advance pi2_src1
     191     add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
     192     vld1.s16    {d7},[r1]!                  @row 0: load 4 samples of src2 and advance pi2_src2
     193     vld1.s16    {d8},[r11],r3               @row 1: load src1, step pi2_src_tmp1 to row 2
     194     vqadd.s16   d18,d6,d7                   @row 0: src1 + src2 (saturating)
     195     vqadd.s16   d18,d18,d0                  @row 0: add rounding term and level shifts
     196     vld1.s16    {d9},[r12],r4               @row 1: load src2, step pi2_src_tmp2 to row 2
     197     vqadd.s16   d20,d8,d9                   @row 1: src1 + src2 (saturating)
     198     vqadd.s16   d19,d20,d0                  @row 1: add rounding term and level shifts
     199     vqshrun.s16 d20,q9,#7                   @q9 = d18:d19 -> d20 = rows 0 and 1 shifted right by 7 and saturated to u8
     200     vld1.s16    {d22},[r11],r3              @row 2: load src1, step to row 3
     201     vld1.s16    {d23},[r12],r4              @row 2: load src2, step to row 3
     202     vqadd.s16   d30,d22,d23                 @row 2: src1 + src2 (saturating)
     203     vqadd.s16   d30,d30,d0                  @row 2: add rounding term and level shifts
     204     vld1.s16    {d24},[r11],r3              @row 3: load src1
     205     vld1.s16    {d25},[r12],r4              @row 3: load src2
     206     vqadd.s16   d18,d24,d25                 @row 3: src1 + src2 (saturating)
     207     vqadd.s16   d31,d18,d0                  @row 3: add rounding term and level shifts
     208     vst1.32     {d20[0]},[r2]!              @store row 0 (4 bytes) and advance pu1_dst
     209     vst1.32     {d20[1]},[r14],r5           @store row 1, step pu1_dst_tmp down one row
     210     vqshrun.s16 d30,q15,#7                  @q15 = d30:d31 -> d30 = rows 2 and 3 shifted right by 7 and saturated to u8
     211     vst1.32     {d30[0]},[r14],r5           @store row 2
     212     subs        r9,r9,#4                    @decrement wd by 4 and check for 0
     213     vst1.32     {d30[1]},[r14],r5           @store row 3
     214     bgt         core_loop_4                 @if greater than 0 repeat the core loop again
     215 @ NOTE(review): columns and rows are consumed 4 at a time here; confirm callers only reach this path with such sizes
     216 end_core_loop_4:
     217 
     218     subs        r8,r8,#4                    @decrement the ht by 4
     219 @ none of the instructions up to the bgt set flags, so the bgt still tests the subs above
     220     add         r0,r0,r7                    @pi2_src1 += 4*src_strd1(bytes) - 2*wd : first column, four rows down
     221     asr         r9,r6,#1                    @reload the column counter: r9 = wd
     222     add         r1,r1,r10                   @pi2_src2 += 4*src_strd2(bytes) - 2*wd
     223     rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
     224     add         r2,r2,r14
     225                                             @pu1_dst += 4*dst_strd - wd
     226     bgt         core_loop_4                 @if ht is greater than 0 process the next four rows
     227 
     228     b           end_loops
    229 
    230 
     231 @ two-row variant of the 4-wide loop, selected when (ht | wd) == 6: 4 columns x 2 rows per iteration
     232 outer_loop_chroma_4x2:
     233     cmp         r9,#0                       @check wd == 0
     234     beq         end_loops                   @if equal, then end the function
     235     rsb         r7,r6,r3,lsl #1             @r7 = 2*src_strd1(bytes) - 2*wd : src1 step to the next pair of rows
     236     rsb         r10,r6,r4,lsl #1            @r10 = 2*src_strd2(bytes) - 2*wd : src2 step to the next pair of rows
     237 core_loop_chroma_4x2:
     238     add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + src_strd1 (r3 is already a byte stride) -> row 1 of src1
     239     add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + src_strd2 (r4 is already a byte stride) -> row 1 of src2
     240     vld1.s16    {d6},[r0]!                  @row 0: load 4 samples of src1 and advance pi2_src1
     241     add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
     242     vld1.s16    {d7},[r1]!                  @row 0: load 4 samples of src2 and advance pi2_src2
     243     vld1.s16    {d8},[r11],r3               @row 1: load src1
     244     vqadd.s16   d18,d6,d7                   @row 0: src1 + src2 (saturating)
     245     vqadd.s16   d18,d18,d0                  @row 0: add rounding term and level shifts
     246     vld1.s16    {d9},[r12],r4               @row 1: load src2
     247     vqadd.s16   d20,d8,d9                   @row 1: src1 + src2 (saturating)
     248     vqadd.s16   d19,d20,d0                  @row 1: add rounding term and level shifts
     249     vqshrun.s16 d20,q9,#7                   @q9 = d18:d19 -> d20 = rows 0 and 1 shifted right by 7 and saturated to u8
     250     vst1.32     {d20[0]},[r2]!              @store row 0 (4 bytes) and advance pu1_dst
     251     vst1.32     {d20[1]},[r14],r5           @store row 1
     252 
     253     subs        r9,r9,#4                    @decrement wd by 4 and check for 0
     254 
     255     bgt         core_loop_chroma_4x2        @if greater than 0 repeat the core loop again
     256 
     257 end_core_loop_chorma_4x2:
     258 
     259     subs        r8,r8,#2                    @decrement the ht by 2
     260 @ none of the instructions up to the bgt set flags, so the bgt still tests the subs above
     261     add         r0,r0,r7                    @pi2_src1 += 2*src_strd1(bytes) - 2*wd : first column, two rows down
     262     asr         r9,r6,#1                    @reload the column counter: r9 = wd
     263     add         r1,r1,r10                   @pi2_src2 += 2*src_strd2(bytes) - 2*wd
     264     rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
     265     add         r2,r2,r14
     266                                             @pu1_dst += 2*dst_strd - wd
     267     bgt         core_loop_chroma_4x2        @if ht is greater than 0 process the next two rows
     268 
     269     b           end_loops
    270 
    271 
    272 
     273 outer_loop_8:
     274     cmp         r9,#0                       @check wd == 0
     275     beq         end_loops                   @if equal, then end the function
     276     add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + src_strd1 (r3 is already a byte stride) -> row 1 of src1
     277     add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + src_strd2 (r4 is already a byte stride) -> row 1 of src2
     278 core_loop_8:
     279 @ each pass produces an 8-column x 4-row tile; loads, adds, narrowing and stores of the four rows are interleaved
     280     vld1.s16    {q12},[r0]!                 @row 0: load 8 samples of src1 and advance pi2_src1
     281     add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
     282     vld1.s16    {q13},[r1]!                 @row 0: load 8 samples of src2 and advance pi2_src2
     283     vqadd.s16   q12,q12,q13                 @row 0: src1 + src2 (saturating)
     284     vld1.s16    {q14},[r11],r3              @row 1: load src1, step to row 2
     285     vqadd.s16   q12,q12,q0                  @row 0: add rounding term and level shifts
     286     vld1.s16    {q15},[r12],r4              @row 1: load src2, step to row 2
     287     vld1.s16    {q8},[r11],r3               @row 2: load src1, step to row 3
     288     vqadd.s16   q11,q14,q15                 @row 1: src1 + src2 (saturating)
     289     vld1.s16    {q9},[r12],r4               @row 2: load src2, step to row 3
     290     vqadd.s16   q11,q11,q0                  @row 1: add rounding term and level shifts
     291     vqshrun.s16 d20,q12,#7                  @row 0: shift right by 7 and saturate to u8
     292     vld1.s16    {q6},[r11],r3               @row 3: load src1
     293     vqadd.s16   q15,q8,q9                   @row 2: src1 + src2 (saturating)
     294     vqshrun.s16 d21,q11,#7                  @row 1: shift right by 7 and saturate to u8
     295     vld1.s16    {q7},[r12],r4               @row 3: load src2
     296     vqadd.s16   q15,q15,q0                  @row 2: add rounding term and level shifts
     297     vst1.32     {d20},[r2]!                 @store row 0 (8 bytes) and advance pu1_dst
     298     vqadd.s16   q4,q6,q7                    @row 3: src1 + src2 (saturating)
     299     vst1.32     {d21},[r14],r5              @store row 1, step pu1_dst_tmp down one row
     300     vqadd.s16   q4,q4,q0                    @row 3: add rounding term and level shifts
     301     vqshrun.s16 d30,q15,#7                  @row 2: shift right by 7 and saturate to u8
     302     vqshrun.s16 d31,q4,#7                   @row 3: shift right by 7 and saturate to u8
     303     add         r11,r0,r3                   @row-1 pointer of src1 for the next 8 columns
     304     add         r12,r1,r4                   @row-1 pointer of src2 for the next 8 columns
     305     vst1.32     {d30},[r14],r5              @store row 2
     306     subs        r9,r9,#8                    @decrement wd by 8 and check for 0
     307     vst1.32     {d31},[r14],r5              @store row 3
     308     bgt         core_loop_8                 @if greater than 0 repeat the core loop again
     309 @ NOTE(review): columns are consumed 8 and rows 4 at a time here; confirm callers only reach this path with such sizes
     310 end_core_loop_8:
     311 
     312     subs        r8,r8,#4                    @decrement the ht by 4
     313 @ none of the instructions up to the bgt set flags, so the bgt still tests the subs above
     314     add         r0,r0,r7                    @pi2_src1 += 4*src_strd1(bytes) - 2*wd : first column, four rows down
     315     asr         r9,r6,#1                    @reload the column counter: r9 = wd
     316     add         r1,r1,r10                   @pi2_src2 += 4*src_strd2(bytes) - 2*wd
     317     rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
     318     add         r2,r2,r14                   @pu1_dst += 4*dst_strd - wd
     319     add         r11,r0,r3                   @row-1 pointer of src1 for the new row group
     320     add         r12,r1,r4                   @row-1 pointer of src2 for the new row group
     321 
     322     bgt         core_loop_8                 @if ht is greater than 0 process the next four rows
     323     b           end_loops
    324 
    325 
    326 
    327 @ this is only for chroma module with inpput 4x2
    328 outer_loop_chroma_8x2:
    329     cmp         r9,#0                       @check wd == 0
    330     beq         end_loops                   @if equal, then end the function
    331     add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    332     add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    333     rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
    334     rsb         r10,r6,r4,lsl #1            @2*src_strd2 - wd
    335 core_loop_chroma_8x2:
    336 
    337     vld1.s16    {q12},[r0]!                 @load and increment the pi2_src1
    338     add         r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    339     vld1.s16    {q13},[r1]!                 @load and increment the pi2_src2
    340     vqadd.s16   q12,q12,q13
    341     vld1.s16    {q14},[r11],r3              @load and increment the pi2_src1 ii iteration
    342     vqadd.s16   q12,q12,q0                  @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    343     vld1.s16    {q15},[r12],r4              @load and increment the pi2_src2 ii iteration
    344     vld1.s16    {q8},[r11],r3               @load and increment the pi2_src1 iii iteration
    345     vqadd.s16   q11,q14,q15                 @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    346     vqadd.s16   q11,q11,q0                  @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    347     vqshrun.s16 d20,q12,#7
    348     vqshrun.s16 d21,q11,#7
    349     vst1.32     {d20},[r2]!                 @store pu1_dst i iteration
    350     vst1.32     {d21},[r14],r5              @store pu1_dst ii iteration
    351 
    352     add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    353     add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    354                                             @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteratio
    355     subs        r9,r9,#8                    @decrement wd by 4 and check for 0
    356 
    357     bgt         core_loop_chroma_8x2        @if greater than 0 repeat the core loop again
    358 
    359 end_core_loop_chroma_8x2:
    360 
    361     subs        r8,r8,#2                    @decrement the ht by 4
    362 
    363     add         r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    364     asr         r9,r6,#1
    365     add         r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
    366     rsb         r14,r9,r5,lsl #1            @4*dst_strd - wd
    367     add         r2,r2,r14
    368     add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    369     add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)                                  @pu1_dst + dst_std - wd
    370 
    371     bgt         core_loop_chroma_8x2
    372 
    373     b           end_loops
    374 
    375 
    376 
    377 
    378 outer_loop_16:
    379     cmp         r9,#0                       @check wd == 0
    380     beq         end_loops                   @if equal, then end the function
    381     add         r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    382     add         r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    383     rsb         r7,r6,r3,lsl #1             @2*src_strd1 - wd
    384     mov         r14,#16
    385     sub         r10,r14,r5
    386     sub         r11,r3,r14
    387     sub         r12,r14,r3
    388 
    389     rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
    390 
    391 
    392 
    393 prolog_16:
    394 
    395 
    396     vld1.s16    {q1},[r0]!                  @load and increment the pi2_src1
    397     vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
    398     vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
    399     vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
    400     vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
    401     subs        r9,r9,#16
    402     vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
    403     subeq       r8,r8,#2
    404     vqadd.s16   q11,q1,q2
    405     vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
    406     vqadd.s16   q14,q5,q6
    407     vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
    408     addeq       r0,r0,r7
    409     addeq       r1,r1,r7
    410     vqadd.s16   q12,q3,q4
    411     vld1.s16    {q1},[r0]!
    412     vqadd.s16   q13,q7,q8
    413 @ if the input is chroma with 8x2 block size
    414     cmp         r8,#0
    415     beq         epilog_16
    416 
    417     vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
    418     vqadd.s16   q11,q11,q0
    419     vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
    420     vqadd.s16   q14,q14,q0
    421     vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
    422     vqadd.s16   q12,q12,q0
    423     vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
    424     vqadd.s16   q15,q13,q0
    425     vqshrun.s16 d20,q11,#7
    426     vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
    427     vqshrun.s16 d21,q14,#7
    428     vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
    429     vqshrun.s16 d26,q12,#7
    430     vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
    431     vqshrun.s16 d27,q15,#7
    432 
    433 
    434 
    435 core_loop_16:
    436 
    437     cmp         r9,#0
    438     vqadd.s16   q11,q1,q2
    439     asreq       r9,r6,#1
    440     vst1.32     {q10},[r2],r5
    441     vqadd.s16   q14,q5,q6
    442     vst1.32     {q13},[r2],r10
    443     addeq       r2,r2,r14
    444     vqadd.s16   q12,q3,q4
    445     subs        r9,r9,#16
    446     addeq       r0,r0,r7
    447     vqadd.s16   q13,q7,q8
    448 
    449     addeq       r1,r1,r7
    450     subseq      r8,r8,#2                    @decrement the ht by 2
    451     beq         epilog_16
    452 
    453 
    454     vqadd.s16   q11,q11,q0
    455     vld1.s16    {q1},[r0]!                  @load and increment the pi2_src1
    456     vqadd.s16   q14,q14,q0
    457     vld1.s16    {q2},[r1]!                  @load and increment the pi2_src2
    458     vqadd.s16   q12,q12,q0
    459     vld1.s16    {q5},[r0],r11               @load and increment the pi2_src1
    460     vqadd.s16   q15,q13,q0
    461     vld1.s16    {q6},[r1],r11               @load and increment the pi2_src2
    462     vqshrun.s16 d20,q11,#7
    463     vld1.s16    {q3},[r0]!                  @load and increment the pi2_src1 ii iteration
    464     vqshrun.s16 d21,q14,#7
    465     vld1.s16    {q4},[r1]!                  @load and increment the pi2_src2 ii iteration
    466     vqshrun.s16 d26,q12,#7
    467     vld1.s16    {q7},[r0],r12               @load and increment the pi2_src1 ii iteration
    468     vqshrun.s16 d27,q15,#7
    469     vld1.s16    {q8},[r1],r12               @load and increment the pi2_src2 ii iteration
    470 
    471 
    472     b           core_loop_16
    473 
    474 
    475 epilog_16:
    476 
    477     vqadd.s16   q11,q11,q0
    478     vqadd.s16   q14,q14,q0
    479     vqadd.s16   q12,q12,q0
    480     vqadd.s16   q15,q13,q0
    481     vqshrun.s16 d20,q11,#7
    482     vqshrun.s16 d21,q14,#7
    483     vqshrun.s16 d26,q12,#7
    484     vqshrun.s16 d27,q15,#7
    485     vst1.32     {q10},[r2],r5
    486     vst1.32     {q13},[r2]
    487 
    488 
    489 
    490 end_core_loop_16:
    491 
    492 
    493 
    494 
    495 
    496 
    497 
    498 
    499 end_loops:
    500     vpop        {d8  -  d15}
    501     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    502 
    503 
    504 
    505 
    506