@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_27_to_33.s
@*
@* @brief
@*  contains function definition for intra prediction interpolation filters
@*
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_mode_27_to_33()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@
@/**
@*******************************************************************************
@*
@* @brief
@*    intra prediction interpolation filter for luma mode 27 to mode 33
@*
@* @par description:
@*    intra prediction for modes 27 to 33 (positive angle, vertical modes),
@*    interpolating from the reference neighboring samples pointed to by
@*    'pu1_ref' into the tu block pointed to by 'pu1_dst'
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source reference samples
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intra prediction mode
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
@                                         word32 src_strd,
@                                         uword8 *pu1_dst,
@                                         word32 dst_strd,
@                                         word32 nt,
@                                         word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd
@r2 => *pu1_dst
@r3 =>  dst_strd
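@
@ for orientation: the neon code below vectorizes the per-pixel reference
@ computation, which for these positive vertical angles is roughly the
@ following c (a sketch of the scalar algorithm reconstructed from the
@ comments in this file, not the shipped c reference verbatim):
@
@ {
@     word32 two_nt         = 2 * nt;
@     word32 intra_pred_ang = gai4_ihevc_ang_table[mode];
@     for(word32 row = 0; row < nt; row++)
@     {
@         word32 pos   = (row + 1) * intra_pred_ang;
@         word32 idx   = pos >> 5;        /* integer step along the reference */
@         word32 fract = pos & 31;        /* 1/32-pel interpolation phase     */
@         for(word32 col = 0; col < nt; col++)
@         {
@             word32 ref_main_idx = two_nt + 1 + col + idx;
@             pu1_dst[row * dst_strd + col] =
@                 (uword8)((pu1_ref[ref_main_idx]     * (32 - fract) +
@                           pu1_ref[ref_main_idx + 1] * fract + 16) >> 5);
@         }
@     }
@ }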

.text
.align 4




.globl ihevc_intra_pred_luma_mode_27_to_33_a9q
.extern gai4_ihevc_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8


.type ihevc_intra_pred_luma_mode_27_to_33_a9q, %function

ihevc_intra_pred_luma_mode_27_to_33_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r5,[sp,#44]                 @loads mode
    ldr         r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
    add         r6,r6,pc

    lsl         r7,r4,#1                    @two_nt

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    tst         r4,#7
    add         r8,r0,r7                    @pu1_ref + two_nt
    mov         lr,#0                       @fract_prev = 0 (used by the 4-column path)
    mov         r12,r4
    bne         core_loop_4

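@ nt is a multiple of 8: predict the block as 8x8 tiles. within a tile the
@ eight rows are tagged (i)..(viii) in the comments and are software
@ pipelined across prologue, kernel_8_rows and epilogue so that loads,
@ multiply-accumulates and stores of neighbouring rows overlap.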
core_loop_8:
    add         r8,r8,#1                    @pu1_ref_main_idx += (two_nt + 1)
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #3               @divide by 8

    vmov.i8     d1,#32
    mul         r7,r4,r12

    vmov.i16    q3,#31
    @lsl            r12,r3,#3

    mov         r1,r8
    @sub            r12,r12,r4
    mov         r5,r4
    mov         r11,#1
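@ one vmull.u8 computes pos for eight rows at once: d0 holds intra_pred_ang
@ in every lane and d3 loads eight consecutive (row + 1) values from
@ gau1_ihevc_planar_factor, after which fract (pos & 31) and idx (pos >> 5)
@ are narrowed out per row.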
prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.u8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d5,q1,#5                    @idx = pos >> 5

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register

    vdup.8      d29,d4[1]                   @(ii)
    and         r9,lr,#0xff                 @(i row) get the last byte

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    asr         lr,lr,#8                    @(ii)shift by 8
    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    and         r9,lr,#0xff                 @(ii)get the last byte

    asr         lr,lr,#8                    @(iii)
    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    and         r9,lr,#0xff                 @(iii)
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iv)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(iv)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    and         r9,lr,#0xff                 @(v)
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    asr         lr,lr,#8                    @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(vi)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)
    asr         lr,lr,#8                    @(vii)

    and         r9,lr,#0xff                 @(vii)
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    asr         lr,lr,#8                    @(viii)
    vdup.8      d25,d4[7]                   @(viii)
    and         r9,lr,#0xff                 @(viii)

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r4,r4,#8

    vst1.8      {d22},[r0],r3               @(iv)
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r7,r7,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    vld1.8      {d5},[r6]                   @loads the row value
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    and         r9,lr,#0xff                 @(i)
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
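@ steady state: each kernel_8_rows iteration drains rows (vi)..(viii) of the
@ previous 8x8 tile while filling rows (i)..(v) of the next; r7 counts the
@ remaining work and r4 tracks the columns left in the current strip of tiles.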
kernel_8_rows:
    asr         lr,lr,#8                    @(ii)
    vdup.8      d31,d4[0]
    subs        r4,r4,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(ii)
    addle       r6,r6,#8                    @increment the row value

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iii)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(iii)

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         lr,lr,#8                    @(iv)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(iv)

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    movle       r4,r5                       @reload nt

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r0,r2,r3

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(v)

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
    asr         lr,lr,#8                    @(vi)

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(vi)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
    asr         lr,lr,#8                    @(vii)

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    and         r9,lr,#0xff                 @(vii)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    asr         lr,lr,#8                    @(viii)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(viii)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    movle       r8,r1                       @reload the source pointer to pu1_ref + two_nt + 1

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    addgt       r8,r8,#8                    @advance the source to the next set of 8 columns in the same row

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
    lslle       r12,r3,#3

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    suble       r12,r12,r5

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    addle       r2,r2,r12                   @advance the dst pointer by 8*dst_strd - nt

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(i)

    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

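@ drain the software pipeline: rows (vi)..(viii) of the final tile still have
@ their last multiplies and stores outstanding.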
epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

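@ nt not a multiple of 8 (nt == 4 for hevc luma): fract and the reference
@ index are computed per row in arm registers, 4 pixels per vector op. the
@ angles for modes 27 to 33 are at most 26, so idx advances by 0 or 1 per
@ row; the pointer is therefore stepped by comparing against the previous
@ row's fract (fract_prev > fract implies idx incremented) instead of
@ recomputing an absolute index.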
core_loop_4:
    add         r10,r8,#1                   @pu1_ref_main_idx += (two_nt + 1)
    add         r11,r8,#2                   @pu1_ref_main_idx_1 += (two_nt + 2)
    mov         r8,#0                       @row = 0

    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1
    vdup.8      d0,r5                       @dup_const_fract
    rsb         r4,r5,#32
    vdup.8      d1,r4                       @dup_const_32_fract

@inner_loop_4
    vld1.32     {d2[0]},[r10]               @ref_main_idx
    add         r8,r8,#1
    mov         lr,r5                       @fract_prev = fract

    vld1.32     {d3[0]},[r11]               @ref_main_idx_1
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d6,r5                       @dup_const_fract
    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d7,r4                       @dup_const_32_fract
    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d8[0]},[r10]               @ref_main_idx
    add         r8,r8,#1

    vld1.32     {d9[0]},[r11]               @ref_main_idx_1
    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)

    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d12,r5                      @dup_const_fract
    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d13,r4                      @dup_const_32_fract
    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d14[0]},[r10]              @ref_main_idx
    add         r8,r8,#1

    vst1.32     {d4[0]},[r2],r3
    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)

    vld1.32     {d15[0]},[r11]              @ref_main_idx_1
    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#1                  @pu1_ref_main_idx += 1
    add         r11,r10,#1                  @pu1_ref_main_idx_1 += 1

    vdup.8      d18,r5                      @dup_const_fract
    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d19,r4                      @dup_const_32_fract
    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d20[0]},[r10]              @ref_main_idx

    vst1.32     {d10[0]},[r2],r3
    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
    vld1.32     {d21[0]},[r11]              @ref_main_idx_1

    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.32     {d16[0]},[r2],r3
    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)

    vst1.32     {d22[0]},[r2],r3

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @restore the registers from sp and return (saved lr pops into pc)