@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_27_to_33.s
@*
@* @brief
@*  contains function definition for intra prediction interpolation filters
@*
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_mode_27_to_33()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@
@/**
@*******************************************************************************
@*
@* @brief
@*    intra prediction interpolation filter for luma mode 27 to mode 33
@*
@* @par description:
@*    intraprediction for mode 27 to 33 (positive angle, vertical mode) with
@*    reference neighboring samples location pointed by 'pu1_ref' to the tu
@*    block location pointed by 'pu1_dst'
     49 @*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (reference samples)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intraprediction mode
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd
@r2 => *pu1_dst
@r3 =>  dst_strd
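@
@for reference, a minimal c sketch of what this routine computes (an
@illustration only, not part of the build; locals row/col/pos/idx/fract are
@hypothetical, while pu1_ref, pu1_dst, dst_strd, nt, two_nt and
@intra_pred_ang match the comments in this file):
@
@    for(row = 0; row < nt; row++)
@    {
@        word32 pos   = (row + 1) * intra_pred_ang;
@        word32 idx   = pos >> 5;                    /* integer step     */
@        word32 fract = pos & 31;                    /* 1/32-pel weight  */
@        for(col = 0; col < nt; col++)
@            pu1_dst[row * dst_strd + col] = (uword8)
@                (((32 - fract) * pu1_ref[two_nt + 1 + idx + col]
@                      + fract  * pu1_ref[two_nt + 2 + idx + col] + 16) >> 5);
@    }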

.equ    nt_offset,      104
.equ    mode_offset,    108

.text
.align 4




.globl ihevc_intra_pred_luma_mode_27_to_33_a9q
.extern gai4_ihevc_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8
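
@note: each .long above holds "symbol - label - 8" so that the
@"ldr rX, <literal>" / "add rX, rX, pc" pair at ulbl1/ulbl2 reconstructs the
@absolute table address: in arm state, reading pc at the add returns the
@address of that instruction plus 8.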


.type ihevc_intra_pred_luma_mode_27_to_33_a9q, %function

ihevc_intra_pred_luma_mode_27_to_33_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r5,[sp,#mode_offset]        @loads mode
    ldr         r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
    add         r6,r6,pc

    lsl         r7,r4,#1                    @two_nt

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    tst         r4,#7
    add         r8,r0,r7                    @pu1_ref + two_nt
    mov         lr,#0                       @row
    mov         r12,r4
    bne         core_loop_4

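@nt is a multiple of 8: process the block as 8x8 tiles. one vmull.u8 below
@evaluates pos = (row + 1) * intra_pred_ang for eight rows at once, giving
@eight fract/idx pairs per pass; the per-row two-tap filters are then
@software-pipelined.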
core_loop_8:
    add         r8,r8,#1                    @pu1_ref_main_idx += (two_nt + 1)
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #3               @divide by 8

    vmov.i8     d1,#32
    mul         r7,r4,r12

    vmov.i16    q3,#31
    @lsl            r12,r3,#3

    mov         r1,r8
    @sub            r12,r12,r4
    mov         r5,r4
    mov         r11,#1

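@prologue: compute fract/idx for the first eight rows and start rows
@(i)..(viii) of the first tile, filling the pipeline that kernel_8_rows
@keeps in steady state.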
prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.u8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d5,q1,#5                    @idx = pos >> 5

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register

    vdup.8      d29,d4[1]                   @(ii)
    and         r9,lr,#0xff                 @(i row) get the last byte

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    asr         lr,lr,#8                    @(ii)shift by 8
    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    and         r9,lr,#0xff                 @(ii)get the last byte

    asr         lr,lr,#8                    @(iii)
    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    and         r9,lr,#0xff                 @(iii)
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iv)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(iv)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    and         r9,lr,#0xff                 @(v)
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    asr         lr,lr,#8                    @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(vi)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)
    asr         lr,lr,#8                    @(vii)

    and         r9,lr,#0xff                 @(vii)
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    asr         lr,lr,#8                    @(viii)
    vdup.8      d25,d4[7]                   @(viii)
    and         r9,lr,#0xff                 @(viii)

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r4,r4,#8

    vst1.8      {d22},[r0],r3               @(iv)
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r7,r7,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

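@prime the kernel: recompute pos/fract/idx so the first pass of
@kernel_8_rows starts with the row indices for its tile already extracted.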
    vld1.8      {d5},[r6]                   @loads the row value
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    and         r9,lr,#0xff                 @(i)
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

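@steady-state kernel: each pass emits one 8x8 tile. loads, multiplies and
@stores of rows (i)..(viii) are interleaved to hide latencies, and the
@fract/idx values for the next tile are computed in flight. r8 steps across
@8-column tiles (addgt) and returns to the row start (movle) when a row
@block completes.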
kernel_8_rows:
    asr         lr,lr,#8                    @(ii)
    vdup.8      d31,d4[0]
    subs        r4,r4,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(ii)
    addle       r6,r6,#8                    @increment the row value

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iii)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(iii)

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         lr,lr,#8                    @(iv)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(iv)

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    movle       r4,r5                       @reload nt

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r0,r2,r3

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(v)

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
    asr         lr,lr,#8                    @(vi)

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(vi)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
    asr         lr,lr,#8                    @(vii)

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    and         r9,lr,#0xff                 @(vii)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    asr         lr,lr,#8                    @(viii)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(viii)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    movle       r8,r1                       @reload the source to pu1_ref + two_nt + 1

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    addgt       r8,r8,#8                    @increment the source to the next set of 8 columns in the same rows

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
    lslle       r12,r3,#3

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    suble       r12,r12,r5

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - nt

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(i)

    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

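@epilogue: flush the pipeline - store row (vi), narrow and store row (vii),
@then finish the multiply for row (viii) and store it.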
epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

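@nt = 4: handle the block row by row with scalar fract updates. rather than
@recomputing idx each row, the main reference pointer advances by one
@whenever fract wraps around (fract_prev > fract), which is equivalent to
@idx growing by one, since the angles for modes 27 to 33 never exceed 32.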
core_loop_4:
    add         r10,r8,#1                   @pu1_ref_main_idx += (two_nt + 1)
    add         r11,r8,#2                   @pu1_ref_main_idx_1 += (two_nt + 2)
    mov         r8,#0

    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1
    vdup.8      d0,r5                       @dup_const_fract
    rsb         r4,r5,#32
    vdup.8      d1,r4                       @dup_const_32_fract

@inner_loop_4
    vld1.32     {d2[0]},[r10]               @ref_main_idx
    add         r8,r8,#1
    mov         lr,r5                       @fract_prev = fract

    vld1.32     {d3[0]},[r11]               @ref_main_idx_1
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d6,r5                       @dup_const_fract
    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d7,r4                       @dup_const_32_fract
    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d8[0]},[r10]               @ref_main_idx
    add         r8,r8,#1

    vld1.32     {d9[0]},[r11]               @ref_main_idx_1
    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)

    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d12,r5                      @dup_const_fract
    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d13,r4                      @dup_const_32_fract
    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d14[0]},[r10]              @ref_main_idx
    add         r8,r8,#1

    vst1.32     {d4[0]},[r2],r3
    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)

    vld1.32     {d15[0]},[r11]              @ref_main_idx_1
    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d18,r5                      @dup_const_fract
    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d19,r4                      @dup_const_32_fract
    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d20[0]},[r10]              @ref_main_idx

    vst1.32     {d10[0]},[r2],r3
    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
    vld1.32     {d21[0]},[r11]              @ref_main_idx_1

    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.32     {d16[0]},[r2],r3
    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)

    vst1.32     {d22[0]},[r2],r3

end_loops:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    545