@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode_19_to_25.s
@*
@* @brief
@*  contains function definitions for luma intra prediction for angular
@*  modes 19 to 25. functions are coded in neon assembly and can be
@*  compiled using rvct
@*
@* @author
@*  naveen sr
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_mode_19_to_25_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  luma intra prediction interpolation filter for angular modes 19 to 25
@*
@* @par description:
@*  projects the reference samples along the prediction angle and computes
@*  each output pixel as a two-tap linear interpolation of the two nearest
@*  reference samples
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (19 to 25)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_mode_19_to_25(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #236
@   nt
@   mode

.equ    nt_offset,      236
.equ    mode_offset,    240

.text
.align 4

.globl ihevc_intra_pred_luma_mode_19_to_25_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8

gai4_ihevc_ang_table_addr_1:
.long gai4_ihevc_ang_table - ulbl_1 - 8

gai4_ihevc_ang_table_addr_2:
.long gai4_ihevc_ang_table - ulbl_2 - 8
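
@ note: each literal above stores (table - ulblN - 8). in arm state, reading
@ pc yields the current instruction address + 8, so the pair
@     ldr   rX, <literal>     @ rX = &table - ulblN - 8
@ ulblN:
@     add   rX, rX, pc        @ pc = ulblN + 8, hence rX = &table
@ recovers the absolute table address position-independently.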

.type ihevc_intra_pred_luma_mode_19_to_25_a9q, %function

ihevc_intra_pred_luma_mode_19_to_25_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 1]

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr_1
ulbl_1:
    add         r7,r7,pc

    ldr         r5,[sp,#mode_offset]        @mode (19 to 25)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table
    sub         r8, r8, #48                 @gai4_ihevc_inv_ang_table[mode - 12]

    ldr         r7, [r7]                    @intra_pred_ang

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4                  @ref_temp + nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #1                  @ref_temp + nt - 1

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    asr         r9, r9, #5

    vld1.32     d0[0],[r1]!                 @ pu1_ref[two_nt + k]

    vst1.32     d0[0],[r6]!                 @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]

    subs        r7, r7, #4
    beq         end_loop_copy
    sub         r1,#4
    sub         r6,#4
    subs        r7,r7,#4
    beq         loop_copy_8
    subs        r7,r7,#8
    beq         loop_copy_16

loop_copy_32:
    vld1.8      d0,[r1]!
    vld1.8      d1,[r1]!
    vld1.8      d2,[r1]!
    vld1.8      d3,[r1]!

    vst1.8      d0,[r6]!
    vst1.8      d1,[r6]!
    vst1.8      d2,[r6]!
    vst1.8      d3,[r6]!
    b           end_loop_copy

loop_copy_16:
    vld1.8      d0,[r1]!
    vld1.8      d1,[r1]!

    vst1.8      d0,[r6]!
    vst1.8      d1,[r6]!
    b           end_loop_copy

loop_copy_8:
    vld1.8      d0,[r1]!
    vst1.8      d0,[r6]!

end_loop_copy:

    ldrb        r11, [r1]
    strb        r11, [r6]
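
@ the copy above, as a c sketch (names follow the reference c function
@ ihevc_intra_pred_luma_mode_19_to_25; two_nt = 2 * nt):
@     for(k = 0; k < nt + 1; k++)
@         ref_temp[k + nt - 1] = pu1_ref[two_nt + k];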

    cmp         r9, #-1
    bge         linear_filtering

    add         r6, sp, r4                  @ref_temp + nt
    sub         r6, r6, #2                  @ref_temp + nt - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #1          @r1 = &src[2nt]

    mov         r7, #128                    @inv_ang_sum

loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang
    mov         r14,r7,lsr #8
    ldrb        r11, [r1, -r14]
@   ldrb        r11, [r1, -r7, lsr #8]
    strb        r11, [r6], #-1

    subs        r9, r9, #1

    bne         loop_copy_ref_idx
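
@ the projection above, as a c sketch (reference-style; ref_main points to
@ ref_temp + nt - 1 and ref_idx = (nt * intra_pred_ang) >> 5 is negative):
@     inv_ang_sum = 128;
@     for(k = -1; k > ref_idx; k--)
@     {
@         inv_ang_sum += inv_ang;
@         ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
@     }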

linear_filtering:
@   after the reference copy/projection, the code below is adapted from the
@   mode 27 to 33 kernel
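@
@ the filter itself, as a c sketch of what the neon code below vectorises
@ (reference-style; intra_pred_ang is negative for modes 19 to 25, so idx
@ can index the projected samples at negative offsets from ref_main):
@     for(row = 0; row < nt; row++)
@     {
@         pos   = (row + 1) * intra_pred_ang;
@         idx   = pos >> 5;
@         fract = pos & 31;
@         for(col = 0; col < nt; col++)
@             pu1_dst[row * dst_strd + col] =
@                 (ref_main[col + idx] * (32 - fract)
@                + ref_main[col + idx + 1] * fract + 16) >> 5;
@     }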

    ldr         r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
ulbl_2:
    add         r6,r6,pc

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    add         r8, sp, r4                  @ref_temp + nt
    sub         r8,#1                       @ref_temp + nt - 1

    tst         r4,#7
    mov         lr,#0                       @row
    mov         r12,r4
    bne         core_loop_4

core_loop_8:
    add         r8,r8,#1                    @pu1_ref_main_idx += (two_nt + 1)
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #3               @divide by 8

    vmov.i8     d1,#32
    mul         r7,r4,r12

    vmov.i16    q3,#31
    @lsl            r12,r3,#3

    mov         r1,r8
    @sub            r12,r12,r4
    mov         r5,r4
    mov         r11,#1

prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.s8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d5,q1,#5                    @idx = pos >> 5
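
@ the five instructions above compute eight rows of (idx, fract) at once:
@ d3 holds the row factors 1..8, vmull.s8 forms the 16-bit pos per row,
@ vand/vmovn packs fract = pos & 31 into d4, and vshrn puts idx = pos >> 5
@ into d5, read back into lr below for scalar address generation.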

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register

    vdup.8      d29,d4[1]                   @(ii)
    sbfx        r9,lr,#0,#8

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    sbfx        r9,lr,#24,#8

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#0,#8
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[7]                   @(viii)
    sbfx        r9,lr,#24,#8

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r4,r4,#8

    vst1.8      {d22},[r0],r3               @(iv)
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r7,r7,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    vld1.8      {d5},[r6]                   @loads the row value
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    sbfx        r9,lr,#0,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]
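
@ the block above already computes (idx, fract) for the next group of eight
@ rows so that kernel_8_rows below stays software-pipelined, overlapping the
@ loads, multiplies and stores of adjacent row groups.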

kernel_8_rows:
    vdup.8      d31,d4[0]
    subs        r4,r4,#8
    sbfx        r9,lr,#8,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)

    addle       r6,r6,#8                    @increment the row value
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#16,#8

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)

    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#24,#8
    movle       r4,r5                       @reload nt

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#0,#8
    add         r0,r2,r3

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#8,#8

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#16,#8

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#24,#8

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    movle       r8,r1                       @reload the source to pu1_src+2nt

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    addgt       r8,r8,#8                    @increment the source next set 8 columns in same row
    lslle       r12,r3,#3
    suble       r12,r12,r5

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)

    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - nt
    sbfx        r9,lr,#0,#8

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

core_loop_4:
    add         r6,r8,#1                    @pu1_ref_main_idx += 1
    mov         r8,#0

    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1
    vdup.8      d0,r5                       @dup_const_fract
    rsb         r4,r5,#32
    vdup.8      d1,r4                       @dup_const_32_fract
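
@ 4x4 path: unlike the 8-wide kernel, (pos, idx, fract) are computed per row
@ in scalar registers and each row's four pixels are interpolated with a
@ single vmull.u8/vmlal.u8 pair, storing one 32-bit lane per row.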

@inner_loop_4
    vld1.32     {d2[0]},[r10]               @ref_main_idx
    add         r8,r8,#1
@   mov         lr,r5                           @fract_prev = fract

    vld1.32     {d3[0]},[r11]               @ref_main_idx_1
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1

    vdup.8      d6,r5                       @dup_const_fract
    vmull.u8    q2,d2,d1                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d7,r4                       @dup_const_32_fract
    vmlal.u8    q2,d3,d0                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d8[0]},[r10]               @ref_main_idx
    add         r8,r8,#1

    vld1.32     {d9[0]},[r11]               @ref_main_idx_1
    vrshrn.i16  d4,q2,#5                    @shift_res = vrshrn_n_u16(add_res, 5)

@   mov         lr,r5                           @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1

    vdup.8      d12,r5                      @dup_const_fract
    vmull.u8    q5,d8,d7                    @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d13,r4                      @dup_const_32_fract
    vmlal.u8    q5,d9,d6                    @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d14[0]},[r10]              @ref_main_idx
    add         r8,r8,#1

    vst1.32     {d4[0]},[r2],r3
    vrshrn.i16  d10,q5,#5                   @shift_res = vrshrn_n_u16(add_res, 5)

    vld1.32     {d15[0]},[r11]              @ref_main_idx_1
@   mov         lr,r5                           @fract_prev = fract
    add         r5,r8,#1                    @row + 1
    mul         r5,r5,r9                    @pos = ((row + 1) * intra_pred_ang)
    mov         lr,r5,asr #5                @idx = pos >> 5
    and         r5,r5,#31                   @fract = pos & (31)
    add         r10,r6,lr                   @ref_main + idx
    add         r11,r10,#1                  @ref_main + idx + 1

    vdup.8      d18,r5                      @dup_const_fract
    vmull.u8    q8,d14,d13                  @vmull_u8(ref_main_idx, dup_const_32_fract)

    rsb         r4,r5,#32
    vdup.8      d19,r4                      @dup_const_32_fract
    vmlal.u8    q8,d15,d12                  @vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.32     {d20[0]},[r10]              @ref_main_idx

    vst1.32     {d10[0]},[r2],r3
    vrshrn.i16  d16,q8,#5                   @shift_res = vrshrn_n_u16(add_res, 5)
    vld1.32     {d21[0]},[r11]              @ref_main_idx_1

    vmull.u8    q11,d20,d19                 @vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d18                 @vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.32     {d16[0]},[r2],r3
    vrshrn.i16  d22,q11,#5                  @shift_res = vrshrn_n_u16(add_res, 5)

    vst1.32     {d22[0]},[r2],r3

end_loops:
    add         sp, sp, #132
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)
    659