@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_27_to_33.s
@*
@* @brief
@*  contains function definition for intra prediction interpolation filters
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_mode_27_to_33()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/**
@*******************************************************************************
@*
@* @brief
@*  intra prediction for modes 27 to 33 (positive angle, vertical modes),
@*  interpolating from the neighbouring samples pointed to by 'pu1_ref'
@*  into the tu block pointed to by 'pu1_dst'
@*
@* @par description:
@*
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (neighbouring reference samples)
@*
@* @param[in] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intra prediction mode
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@.if intra_pred_chroma_27_to_33 == c
@void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
@                                           word32 src_strd,
@                                           uword8 *pu1_dst,
@                                           word32 dst_strd,
@                                           word32 nt,
@                                           word32 mode)

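@/* A rough scalar C sketch of the computation implemented below (a
@   reference aid only, not part of the build; the index arithmetic of
@   the actual C reference may differ slightly). Chroma samples are
@   interleaved Cb/Cr, so one step along the reference row advances by
@   2 bytes, and the rounding (+16, >>5) matches the vrshrn #5 used in
@   the NEON code:
@
@   for(row = 0; row < nt; row++)
@   {
@       word32 pos   = (row + 1) * intra_pred_ang;
@       word32 idx   = pos >> 5;
@       word32 fract = pos & 31;
@       for(col = 0; col < 2 * nt; col++)
@       {
@           word32 ref_main_idx = four_nt + 2 + 2 * idx + col;
@           pu1_dst[row * dst_strd + col] =
@               (uword8)(((32 - fract) * pu1_ref[ref_main_idx]
@                         + fract * pu1_ref[ref_main_idx + 2] + 16) >> 5);
@       }
@   }
@*/
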
.equ    nt_offset,          104             @40 bytes (r4-r12, r14) + 64 bytes (d8-d15)
.equ    mode_offset,        108

.text
.align 4

.globl ihevc_intra_pred_chroma_mode_27_to_33_a9q
.extern gai4_ihevc_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8


.type ihevc_intra_pred_chroma_mode_27_to_33_a9q, %function

ihevc_intra_pred_chroma_mode_27_to_33_a9q:

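@ Register map on entry (AAPCS; inferred from the loads and stores below):
@ r0 = pu1_ref, r1 = src_strd (unused, later reused as a scratch pointer),
@ r2 = pu1_dst, r3 = dst_strd; nt and mode are picked up from the stack.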
    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r5,[sp,#mode_offset]        @loads mode
    ldr         r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35]
ulbl1:
    add         r6,r6,pc

    lsl         r7,r4,#2                    @four_nt

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    tst         r4,#7                       @nt a multiple of 8?
    add         r8,r0,r7                    @pu1_ref + four_nt
    mov         lr,#0                       @row
    mov         r12,r4
    bne         core_loop_4                 @nt == 4: take the 4-wide path
    lsl         r4,r4,#1                    @r4 = 2*nt, the chroma row width in bytes
    b           core_loop_8

core_loop_8:
    add         r8,r8,#2                    @pu1_ref_main_idx = pu1_ref + four_nt + 2
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #4               @(2*nt) >> 4 = nt/8, number of 8-row blocks

    vmov.i8     d1,#32
    mul         r7,r4,r12                   @total number of 8-byte stores, counted down below

    vmov.i16    q3,#31

    mov         r1,r8                       @save the ref_main start for reloads
    mov         r5,r4                       @save 2*nt
    mov         r11,#2                      @pointer step to ref_main_idx_1 (one chroma pair)

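@ 8-sample path: eight rows are kept in flight, software-pipelined. The
@ roman-numeral tags (i)..(viii) in the comments mark which of the eight
@ rows an instruction belongs to; loads, multiplies and stores of
@ different rows are interleaved to hide memory and NEON latency.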
prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.u8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d5,q1,#5                    @idx = pos >> 5

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register
    lsl         lr,lr,#1

    vdup.8      d29,d4[1]                   @(ii)
    and         r9,lr,#0xff                 @(i row) get the last byte

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    asr         lr,lr,#8                    @(ii)shift by 8
    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    and         r9,lr,#0xff                 @(ii)get the last byte

    asr         lr,lr,#8                    @(iii)
    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    and         r9,lr,#0xff                 @(iii)
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iv)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(iv)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    lsl         lr,lr,#1

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    and         r9,lr,#0xff                 @(v)
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    asr         lr,lr,#8                    @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(vi)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)
    asr         lr,lr,#8                    @(vii)

    and         r9,lr,#0xff                 @(vii)
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    asr         lr,lr,#8                    @(viii)
    vdup.8      d25,d4[7]                   @(viii)
    and         r9,lr,#0xff                 @(viii)

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r7,r7,#8

    vst1.8      {d22},[r0],r3               @(iv)
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r4,r4,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    vld1.8      {d5},[r6]                   @loads the row value
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    lsl         lr,lr,#1
    and         r9,lr,#0xff                 @(i)
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

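@ Steady-state loop: stores for rows (vi)..(viii) of the previous pass
@ are interleaved with loads and filtering for the next eight rows, while
@ the pos/idx/fract values for the pass after that are recomputed.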
kernel_8_rows:
    asr         lr,lr,#8                    @(ii)
    vdup.8      d31,d4[0]
    subs        r4,r4,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    and         r9,lr,#0xff                 @(ii)
    addle       r6,r6,#8                    @increment the row value

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         lr,lr,#8                    @(iii)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(iii)

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         lr,lr,#8                    @(iv)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(iv)

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    movle       r4,r5                       @reload the row width (2*nt)

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    lsl         lr,lr,#1

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r0,r2,r3

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)
    and         r9,lr,#0xff                 @(v)

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         lr,lr,#8                    @(vi)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         r9,lr,#0xff                 @(vi)

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)
    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    asr         lr,lr,#8                    @(vii)

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.u16   d3,q1,#5                    @idx = pos >> 5
    and         r9,lr,#0xff                 @(vii)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)
    asr         lr,lr,#8                    @(viii)

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         r9,lr,#0xff                 @(viii)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    movle       r8,r1                       @reload the source to pu1_ref + four_nt + 2

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    addgt       r8,r8,#8                    @advance the source to the next set of 8 columns in the same rows

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)
    lslle       r12,r3,#3                   @8*dst_strd

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    suble       r12,r12,r5                  @r12 = 8*dst_strd - 2*nt

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    addle       r2,r2,r12                   @advance the dst pointer by 8*dst_strd - 2*nt

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
    lsl         lr,lr,#1

    and         r9,lr,#0xff                 @(i)
    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

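@ Drain the pipeline: store the rows still in flight ((vi)..(viii))
@ once the kernel loop has consumed all of its input.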
epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

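@ 4-sample path (nt == 4), unrolled over the four rows: each row produces
@ four chroma pairs (8 bytes). fract is tracked across rows (fract_prev
@ vs fract) to decide when ref_main_idx must step to the next reference
@ pair.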
core_loop_4:
    add         r10,r8,#2                   @pu1_ref_main_idx = pu1_ref + four_nt + 2
    add         r11,r8,#4                   @pu1_ref_main_idx_1 = pu1_ref + four_nt + 4
    mov         r8,#0                       @row

    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
    add         r11,r10,#2                  @pu1_ref_main_idx_1 = pu1_ref_main_idx + 2
    vdup.8      d0,r5                       @dup_const_fract
    rsb         r4,r5,#32
    vdup.8      d1,r4                       @dup_const_32_fract

@inner_loop_4
    vld1.8      {d2},[r10]                  @ref_main_idx
    add         r8,r8,#1
    mov         lr,r5                       @fract_prev = fract

    vld1.8      {d3},[r11]                  @ref_main_idx_1
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
    add         r11,r10,#2                  @pu1_ref_main_idx_1 = pu1_ref_main_idx + 2

    vdup.8      d6,r5                       @dup_const_fract
    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d7,r4                       @dup_const_32_fract
    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d8},[r10]                  @ref_main_idx
    add         r8,r8,#1

    vld1.8      {d9},[r11]                  @ref_main_idx_1
    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)

    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
    add         r11,r10,#2                  @pu1_ref_main_idx_1 = pu1_ref_main_idx + 2

    vdup.8      d12,r5                      @dup_const_fract
    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d13,r4                      @dup_const_32_fract
    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d14},[r10]                 @ref_main_idx
    add         r8,r8,#1

    vst1.8      {d4},[r2],r3
    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d15},[r11]                 @ref_main_idx_1
    mov         lr,r5                       @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    and         r5,r5,#31                   @fract = pos & (31)
    cmp         lr,r5                       @if(fract_prev > fract)
    addgt       r10,r10,#2                  @pu1_ref_main_idx += 2
    add         r11,r10,#2                  @pu1_ref_main_idx_1 = pu1_ref_main_idx + 2

    vdup.8      d18,r5                      @dup_const_fract
    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d19,r4                      @dup_const_32_fract
    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d20},[r10]                 @ref_main_idx

    vst1.8      {d10},[r2],r3
    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
    vld1.8      {d21},[r11]                 @ref_main_idx_1

    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d16},[r2],r3
    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r2],r3

end_loops:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (popping pc returns)