@ [code-browser navigation chrome, commented out so the assembler ignores it:
@  Home | History | Annotate | Download | only in arm]
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @******************************************************************************
     20 @* @file
     21 @*  ihevc_inter_pred_luma_horz.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for inter prediction  interpolation.
     25 @* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  parthiban v
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*  - ihevc_inter_pred_luma_horz()
     35 @*
     36 @* @remarks
     37 @*  none
     38 @*
     39 @*******************************************************************************
     40 @*/
     41 
     42 @/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
     43 @/* include reconstruction */
     44 @
     45 
     46 @/**
     47 @*******************************************************************************
     48 @*
     49 @* @brief
     50 @*     interprediction luma filter for vertical input
     51 @*
     52 @* @par description:
     53 @*    applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
     54 @*    the elements pointed by 'pu1_src' and  writes to the location pointed by
     55 @*    'pu1_dst'  the output is downshifted by 6 and clipped to 8 bits
     56 @*    assumptions : the function is optimized considering the fact width is
     57 @*    multiple of 4 or 8. and height as multiple of 2.
     58 @*
     59 @* @param[in] pu1_src
     60 @*  uword8 pointer to the source
     61 @*
     62 @* @param[out] pu1_dst
     63 @*  uword8 pointer to the destination
     64 @*
     65 @* @param[in] src_strd
     66 @*  integer source stride
     67 @*
     68 @* @param[in] dst_strd
     69 @*  integer destination stride
     70 @*
     71 @* @param[in] pi1_coeff
     72 @*  word8 pointer to the filter coefficients
     73 @*
     74 @* @param[in] ht
     75 @*  integer height of the array
     76 @*
     77 @* @param[in] wd
     78 @*  integer width of the array
     79 @*
     80 @* @returns
     81 @*
     82 @* @remarks
     83 @*  none
     84 @*
     85 @*******************************************************************************
     86 @*/
     87 
     88 @void ihevc_inter_pred_luma_horz (
     89 @                            uword8 *pu1_src,
     90 @                            uword8 *pu1_dst,
     91 @                            word32 src_strd,
     92 @                            word32 dst_strd,
     93 @                            word8 *pi1_coeff,
     94 @                            word32 ht,
     95 @                            word32 wd   )
     96 
     97 @**************variables vs registers*****************************************
     98 @   r0 => *pu1_src
     99 @   r1 => *pu1_dst
    100 @   r2 =>  src_strd
    101 @   r3 =>  dst_strd
    102 @   r4 => *pi1_coeff
    103 @   r5 =>  ht
    104 @   r6 =>  wd
    105 
    106 .text
    107 .align 4
    108 
    109 
    110 
    111 
    112 .globl ihevc_inter_pred_luma_horz_a9q
    113 
    114 .type ihevc_inter_pred_luma_horz_a9q, %function
    115 
    116 ihevc_inter_pred_luma_horz_a9q:
    117 
    118     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    119     @str        r1,[sp,#-4]
    120     @ mov       r7,#8192
    121 start_loop_count:
    122     @ ldr       r1,[sp,#-4]
    123 
    124 
    125     ldr         r4,[sp,#40]                 @loads pi1_coeff
    126     ldr         r8,[sp,#44]                 @loads ht
    127     ldr         r10,[sp,#48]                @loads wd
    128 
    129     vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    130     mov         r11,#1
    131     subs        r14,r8,#0                   @checks for ht == 0
    132 
    133     vabs.s8     d2,d0                       @vabs_s8(coeff)
    134 
    135     @ble       end_loops
    136 
    137 
    138     vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    139     sub         r12,r0,#3                   @pu1_src - 3
    140     vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    141     add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    142     vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    143     rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
    144     vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    145     rsb         r8,r10,r3,lsl #1            @2*dst_strd - wd
    146     vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
    147 
    148     vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    149     @ tst       r10,#7                          @checks wd for multiples
    150     vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    151     vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
    152 
    153     mov         r7,r1
    154 
    155     cmp         r10,#4
    156     ble         outer_loop_4
    157 
    158     cmp         r10,#24
    159     moveq       r10,#16
    160     addeq       r8,#8
    161     addeq       r9,#8
    162 
    163     cmp         r10,#16
    164     bge         outer_loop_16
    165 
    166     cmp         r10,#12
    167     addeq       r8,#4
    168     addeq       r9,#4
    169     b           outer_loop_8
    170 
    171 
    172 outer_loop8_residual:
    173     sub         r12,r0,#3                   @pu1_src - 3
    174     mov         r1,r7
    175     mov         r14,#32
    176     add         r1,#16
    177     add         r12,#16
    178     mov         r10,#8
    179     add         r8,#8
    180     add         r9,#8
    181 
    182 outer_loop_8:
    183 
    184     add         r6,r1,r3                    @pu1_dst + dst_strd
    185     add         r4,r12,r2                   @pu1_src + src_strd
    186     subs        r5,r10,#0                   @checks wd
    187 
    188     ble         end_inner_loop_8
    189 
    190 inner_loop_8:
    191     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    192     vld1.u32    {d1},[r12],r11
    193     vld1.u32    {d2},[r12],r11
    194     vld1.u32    {d3},[r12],r11
    195 
    196 
    197 
    198 
    199 
    200     @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    201     @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    202     @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    203     @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    204     @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
    205     @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    206     @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    207     @ vext.u8   d14,d12,d13,#2
    208 
    209     @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
    210     @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    211     @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    212     @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    213     @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
    214     @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
    215     vld1.u32    {d4},[r12],r11
    216     vmull.u8    q4,d1,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    217     vld1.u32    {d5},[r12],r11
    218     vmlal.u8    q4,d3,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    219     vld1.u32    {d6},[r12],r11
    220     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    221     vld1.u32    {d7},[r12],r11
    222     vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    223     vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    224     vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    225     vld1.u32    {d13},[r4],r11
    226     vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    227     vld1.u32    {d14},[r4],r11
    228     vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    229     vld1.u32    {d15},[r4],r11
    230     vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    231     vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd
    232 
    233     vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    234     vld1.u32    {d17},[r4],r11
    235     vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    236     vld1.u32    {d18},[r4],r11
    237     vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    238     vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
    239     vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    240     vqrshrun.s16 d20,q4,#6                  @right shift and saturating narrow result 1
    241     vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    242     vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    243     vst1.8      {d20},[r1]!                 @store the result pu1_dst
    244     vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    245     vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    246 
    247 
    248 
    249     vqrshrun.s16 d8,q5,#6                   @right shift and saturating narrow result 2
    250     subs        r5,r5,#8                    @decrement the wd loop
    251     vst1.8      {d8},[r6]!                  @store the result pu1_dst
    252     cmp         r5,#4
    253     bgt         inner_loop_8
    254 
    255 end_inner_loop_8:
    256     subs        r14,r14,#2                  @decrement the ht loop
    257     add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
    258     add         r1,r1,r8                    @increment the dst pointer by 2*dst_strd-wd
    259     bgt         outer_loop_8
    260 
    261 
    262 
    263 
    264 
    265     ldr         r10,[sp,#48]                @loads wd
    266     cmp         r10,#12
    267 
    268     beq         outer_loop4_residual
    269 
    270 
    271 end_loops:
    272 
    273     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    274 
    275 
    276 
    277 
    278 
    279 
    280 outer_loop_16:
    281     str         r0, [sp, #-4]!
    282     str         r7, [sp, #-4]!
    283 
    284     add         r6,r1,r3                    @pu1_dst + dst_strd
    285     add         r4,r12,r2                   @pu1_src + src_strd
    286     and         r0, r12, #31
    287     sub         r5,r10,#0                   @checks wd
    288     @ble       end_loops1
    289     pld         [r12, r2, lsl #1]
    290     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    291     pld         [r4, r2, lsl #1]
    292     vld1.u32    {q1},[r12],r11
    293     vld1.u32    {q2},[r12],r11
    294     vld1.u32    {q3},[r12],r11
    295     vld1.u32    {q6},[r12],r11
    296     vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    297     vld1.u32    {q7},[r12],r11
    298     vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    299     vld1.u32    {q8},[r12],r11
    300     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    301     vld1.u32    {q9},[r12],r11
    302     vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    303     vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    304     vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    305     vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    306     vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    307 
    308 
    309 inner_loop_16:
    310 
    311 
    312     subs        r5,r5,#16
    313     vmull.u8    q10,d3,d25
    314 
    315     add         r12,#8
    316     vmlsl.u8    q10,d1,d24
    317 
    318     subeq       r14,r14,#2
    319     vmlal.u8    q10,d7,d27
    320 
    321     vld1.u32    {q0},[r4],r11               @vector load pu1_src
    322     vmlsl.u8    q10,d5,d26
    323 
    324     vld1.u32    {q1},[r4],r11
    325     vmlal.u8    q10,d13,d28
    326 
    327     vld1.u32    {q2},[r4],r11
    328     vmlal.u8    q10,d17,d30
    329 
    330     vld1.u32    {q3},[r4],r11
    331     vmlsl.u8    q10,d15,d29
    332 
    333     vld1.u32    {q6},[r4],r11
    334     vmlsl.u8    q10,d19,d31
    335 
    336     vld1.u32    {q7},[r4],r11
    337     vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
    338 
    339     vld1.u32    {q8},[r4],r11
    340     vmull.u8    q5,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    341 
    342     vld1.u32    {q9},[r4],r11
    343     vmlal.u8    q5,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    344 
    345     pld         [r12, r2, lsl #2]
    346     pld         [r4, r2, lsl #2]
    347 
    348     add         r4,#8
    349     vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    350 
    351     addeq       r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
    352     vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    353 
    354     addeq       r4,r12,r2                   @pu1_src + src_strd
    355     vqrshrun.s16 d9,q10,#6
    356 
    357     vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    358 
    359 @   and         r7, r12, #31
    360     vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    361 
    362     vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    363 
    364     vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    365 
    366     vmull.u8    q11,d3,d25
    367 
    368     vmlsl.u8    q11,d1,d24
    369 
    370     vst1.8      {q4},[r1]!                  @store the result pu1_dst
    371     vmlal.u8    q11,d7,d27
    372 
    373     addeq       r1,r1,r8
    374     vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
    375 
    376 @   cmp         r7, r0
    377     vmlsl.u8    q11,d5,d26
    378 
    379     vmlal.u8    q11,d13,d28
    380 
    381     vmlal.u8    q11,d17,d30
    382 
    383 @   mov         r0, r7
    384     vmlsl.u8    q11,d15,d29
    385 
    386     cmp         r14,#0
    387     vmlsl.u8    q11,d19,d31
    388 
    389     beq         epilog_16
    390     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    391     vld1.u32    {q1},[r12],r11
    392     vld1.u32    {q2},[r12],r11
    393     vld1.u32    {q3},[r12],r11
    394     vld1.u32    {q6},[r12],r11
    395     vqrshrun.s16 d11,q11,#6
    396     vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    397     vld1.u32    {q7},[r12],r11
    398     vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    399     vld1.u32    {q8},[r12],r11
    400     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    401     vld1.u32    {q9},[r12],r11
    402     vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    403     vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    404     cmp         r5,#0
    405     vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    406     moveq       r5,r10
    407     vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    408     vst1.8      {q5},[r6]!                  @store the result pu1_dst
    409     vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    410     addeq       r6,r1,r3                    @pu1_dst + dst_strd
    411     b           inner_loop_16
    412 
    413 
    414 epilog_16:
    415     vqrshrun.s16 d11,q11,#6
    416     vst1.8      {q5},[r6]!                  @store the result pu1_dst
    417 
    418     ldr         r7, [sp], #4
    419     ldr         r0, [sp], #4
    420     ldr         r10,[sp,#48]
    421     cmp         r10,#24
    422 
    423     beq         outer_loop8_residual
    424 
    425 
    426 
    427 end_loops1:
    428 
    429     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    430 
    431 
    432 
    433 
    434 
    435 
    436 
    437 
    438 outer_loop4_residual:
    439     sub         r12,r0,#3                   @pu1_src - 3
    440     mov         r1,r7
    441     add         r1,#8
    442     mov         r10,#4
    443     add         r12,#8
    444     mov         r14,#16
    445     add         r8,#4
    446     add         r9,#4
    447 
    448 outer_loop_4:
    449     add         r6,r1,r3                    @pu1_dst + dst_strd
    450     add         r4,r12,r2                   @pu1_src + src_strd
    451 
    452     subs        r5,r10,#0                   @checks wd
    453     ble         end_inner_loop_4
    454 
    455 inner_loop_4:
    456     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    457     vld1.u32    {d1},[r12],r11
    458     vld1.u32    {d2},[r12],r11
    459     vld1.u32    {d3},[r12],r11
    460     vld1.u32    {d4},[r12],r11
    461     vld1.u32    {d5},[r12],r11
    462     vld1.u32    {d6},[r12],r11
    463     vld1.u32    {d7},[r12],r11
    464     @add       r12,r12,#4                      @increment the input pointer
    465     sub         r12,r12,#4
    466     @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    467     @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    468     @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    469 
    470     @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    471     @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    472     @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    473     @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    474     vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    475     vld1.u32    {d13},[r4],r11
    476     vzip.32     d0,d12                      @vector zip the i iteration and ii interation in single register
    477     vld1.u32    {d14},[r4],r11
    478     vzip.32     d1,d13
    479     vld1.u32    {d15},[r4],r11
    480     vzip.32     d2,d14
    481     vld1.u32    {d16},[r4],r11
    482     vzip.32     d3,d15
    483     vld1.u32    {d17},[r4],r11
    484     vzip.32     d4,d16
    485     vld1.u32    {d18},[r4],r11
    486     vzip.32     d5,d17
    487     vld1.u32    {d19},[r4],r11
    488     sub         r4,r4,#4
    489     @ add       r4,r4,#4                        @increment the input pointer
    490     @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    491     @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
    492     @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    493     @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    494     @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    495     @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
    496     @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
    497 
    498 
    499 
    500 
    501 
    502 
    503 
    504     vzip.32     d6,d18
    505     vzip.32     d7,d19
    506 
    507     vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
    508     vmlsl.u8    q4,d0,d24
    509     vmlsl.u8    q4,d2,d26
    510     vmlal.u8    q4,d3,d27
    511     vmlal.u8    q4,d4,d28
    512     vmlsl.u8    q4,d5,d29
    513     vmlal.u8    q4,d6,d30
    514     vmlsl.u8    q4,d7,d31
    515 
    516     vqrshrun.s16 d8,q4,#6                   @narrow right shift and saturating the result
    517     vst1.32     {d8[0]},[r1]!               @store the i iteration result which is in upper part of the register
    518     vst1.32     {d8[1]},[r6]!               @store the ii iteration result which is in lower part of the register
    519     subs        r5,r5,#4                    @decrement the wd by 4
    520     bgt         inner_loop_4
    521 
    522 end_inner_loop_4:
    523     subs        r14,r14,#2                  @decrement the ht by 4
    524     add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
    525     add         r1,r1,r8                    @increment the output pointer 2*dst_strd-wd
    526     bgt         outer_loop_4
    527     @subs   r7,r7,#1
    528     @ bgt   start_loop_count
    529 
    530     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    531 
    532 
    533 
    534 
    535 
    536 
    537 
    538