@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_11_to_17.s
@*
@* @brief
@*  contains function definitions for intra prediction chroma mode 11 to 17
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction filter for modes 11 to 17
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (11 to 17)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode
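
@ Rough C-level sketch of what this routine computes (illustrative only;
@ variable names and the exact reference-array layout are assumptions, not
@ the reference code).  For the horizontal-side modes 11 to 17 the main
@ reference is built from the left neighbours (plus projected top
@ neighbours when the angle requires it), and each output sample is the
@ standard HEVC two-tap interpolation, applied to interleaved U/V pairs:
@
@   for(col = 0; col < nt; col++)
@   {
@       pos   = (col + 1) * intra_pred_ang;   /* negative for modes 11-17 */
@       idx   = pos >> 5;
@       fract = pos & 31;
@       for(row = 0; row < nt; row++)
@       {
@           ref_idx = 2 * (row + idx + 1);    /* 2x because U/V interleave */
@           pu1_dst[row * dst_strd + 2 * col] =
@               ((32 - fract) * ref_main[ref_idx] +
@                fract * ref_main[ref_idx + 2] + 16) >> 5;            /* U */
@           pu1_dst[row * dst_strd + 2 * col + 1] =
@               ((32 - fract) * ref_main[ref_idx + 1] +
@                fract * ref_main[ref_idx + 3] + 16) >> 5;            /* V */
@       }
@   }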

.text
.align 4



.globl ihevc_intra_pred_chroma_mode_11_to_17_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_11_17

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_chroma_11_17_addr:
.long idx_neg_idx_chroma_11_17 - ulbl3 - 8

col_for_intra_chroma_addr_1:
.long col_for_intra_chroma - ulbl4 - 8

col_for_intra_chroma_addr_2:
.long col_for_intra_chroma - ulbl5 - 8

col_for_intra_chroma_addr_3:
.long col_for_intra_chroma - ulbl6 - 8
.type ihevc_intra_pred_chroma_mode_11_to_17_a9q, %function

ihevc_intra_pred_chroma_mode_11_to_17_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#44]                 @mode (11 to 17)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table[mode - 11]
    sub         r8, r8, #44

    ldr         r7, [r7]                    @intra_pred_ang
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 2]

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4, lsl #1          @ref_temp + 2 * nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #2                  @ref_temp + 2*nt - 2

    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    sub         r1,r1,#6                    @address calculation for copying 4 halfwords

    asr         r9, r9, #5

    vld1.8      d0,[r1]
    vrev64.16   d0,d0
    vst1.8      d0,[r6]!

    sub         r1,#8

    subs        r7, r7, #4
    addeq       r1,#8
    beq         end_loop_copy
    subs        r7,r7,#4
    beq         loop_copy_8
    subs        r7,r7,#8
    beq         loop_copy_16

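@ The copy loops below (together with the 4 pairs already copied above)
@ build the start of ref_temp: the left reference of pu1_ref is copied in
@ reverse pair order (U and V kept together) from ref_temp + 2*nt - 2
@ upwards.  A rough C equivalent (a sketch, names illustrative):
@
@   for(k = 0; k < 2 * (nt + 1); k += 2)
@   {
@       ref_temp[2 * (nt - 1) + k]     = pu1_ref[4 * nt - k];       /* U */
@       ref_temp[2 * (nt - 1) + k + 1] = pu1_ref[4 * nt - k + 1];   /* V */
@   }
@
@ The vld1/vrev64.16/vst1 groups do this 4 pairs (8 bytes) at a time; the
@ final halfword is copied with ldrh/strh after end_loop_copy.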
loop_copy_32:
    sub         r1,#24
    vld1.8      {d0,d1,d2,d3},[r1]

    sub         r1,#24
    vld1.8      {d4,d5,d6},[r1]

    vrev64.16   d6,d6
    vrev64.16   d5,d5
    vrev64.16   d4,d4
    vrev64.16   d3,d3
    vrev64.16   d2,d2
    vrev64.16   d1,d1
    vrev64.16   d0,d0

    vst1.8      d3,[r6]!
    vst1.8      d2,[r6]!
    vst1.8      d1,[r6]!
    vst1.8      d0,[r6]!
    vst1.8      d6,[r6]!
    vst1.8      d5,[r6]!
    vst1.8      d4,[r6]!

    b           end_loop_copy

loop_copy_16:
    sub         r1,#16
    vld1.8      {d0,d1,d2},[r1]

    vrev64.16   d2,d2
    vrev64.16   d1,d1
    vrev64.16   d0,d0

    vst1.8      d2,[r6]!
    vst1.8      d1,[r6]!
    vst1.8      d0,[r6]!

    b           end_loop_copy
loop_copy_8:
    vld1.8      d0,[r1]
    vrev64.16   d0,d0
    vst1.8      d0,[r6]!
end_loop_copy:
    sub         r1,#2

    ldrh        r11, [r1], #-2
    strh        r11, [r6], #2

    cmp         r9, #-1
    bge         prologue_8_16_32
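
@ If the projected reference index (nt * intra_pred_ang) >> 5 goes below
@ -1, part of the top (side) reference has to be projected onto the main
@ (left) reference using the inverse angle.  The loop below is roughly
@ equivalent to this C sketch (names illustrative; k is a pair index and
@ ref_main = ref_temp + 2 * nt):
@
@   ref_idx     = (nt * intra_pred_ang) >> 5;
@   inv_ang_sum = 128;
@   for(k = -2; k >= ref_idx; k--)
@   {
@       inv_ang_sum += inv_ang;
@       ref_main[2 * k]     = pu1_ref[4 * nt + 2 * (inv_ang_sum >> 8)];
@       ref_main[2 * k + 1] = pu1_ref[4 * nt + 2 * (inv_ang_sum >> 8) + 1];
@   }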

    add         r6, sp, r4, lsl #1          @ref_temp + 2 * nt
    sub         r6, r6, #4                  @ref_temp + 2 * nt - 2 - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]

    mov         r7, #128                    @inv_ang_sum

loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang

    mov         r0,r7, lsr #8
    mov         r0,r0, lsl #1

    ldrh        r11, [r1, r0]
    strh        r11, [r6], #-2

    subs        r9, r9, #1

    bne         loop_copy_ref_idx

prologue_8_16_32:

    ldr         r14, col_for_intra_chroma_addr_1
ulbl4:
    add         r14,r14,pc

    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #4)

    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #11
    ldr         r12, idx_neg_idx_chroma_11_17_addr @load least idx table
ulbl3:
    add         r12,r12,pc

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    mov         r9,r9,lsl #1
    add         r1, sp, r4, lsl #1          @ref_temp + 2nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @least idx (subtracted from the final idx values)
    sub         r1, r1, #2                  @ref_temp + 2nt - 2

    add         r6, r1, r9

    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 reference bytes needed (starting from the least idx)
    vshr.s16    q11, q11, #5

@   mov     r0, #31
    vmov.i8     d29, #31                    @contains #31 for vand operation

@   mov     r0, #32
    vmov.i8     d28, #32

    vqmovn.s16  d8, q11
    vshl.s8     d8, d8, #1                  @ 2 * idx

    vand        d6, d6, d29                 @fract values in d6

@   mov     r0, #2
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1

    mov         r0,#0x100                   @ idx value for v is +1 of u
    vdup.u16    d27,r0
    vadd.u8     d27,d27,d29
    mov         r0,#0
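
@ Summary of the register roles set up above for the first 8 output
@ columns (4 U/V pairs):
@   d6  = fract = ((col + 1) * intra_pred_ang) & 31
@   d8  = 2 * idx (doubled because U and V are interleaved), to which the
@         row/U-V offsets in d27 are added and the least idx in d26 is
@         subtracted so it indexes the 32-byte window loaded into d0-d3
@   d28 = 32, d29 = 2 (step to the next reference pair)
@ Each output byte is then the two-tap interpolation
@   pred = ((32 - fract) * ref[i] + fract * ref[i + 2] + 16) >> 5
@ realised below with vmull/vmlal and a rounding narrowing shift by 5.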

    vadd.s8     d8, d8, d27                 @ref_main_idx (add row)
    vsub.s8     d8, d8, d26                 @ref_main_idx (row 0)
    vadd.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vadd.s8     d4, d8, d29                 @ref_main_idx (row 1)
    vadd.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)

@   mov     r0, #4              @ 2 *(row * 2 )
    vmov.i8     d29, #4

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vadd.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vadd.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vadd.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vadd.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vadd.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    cmp         r4,#4
    beq         end_func
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vadd.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #4                @subtract 4 and go to end if all 8x8 blocks are done

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func
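
@ One pass above produces an 8x8 block of output bytes (8 rows of 4 U/V
@ pairs).  The conditional updates below pick the next block: while the
@ column counter r11 for the current strip of 8 rows is still positive,
@ the walk continues to the right (r8 steps to the next idx_neg_idx entry
@ and r2 is moved back up 8 rows and right by 8 bytes via r7); otherwise
@ the counters are reset and r0, the row offset used to bias the
@ reference index, is advanced by 8 to start the next strip of 8 rows.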

    subs        r11, r11, #8
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4, lsl #1
    ldrle       r14, col_for_intra_chroma_addr_2
ulbl5:
    addle       r14,r14,pc
    addle       r0, r0, #8

    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vshl.s8     d11, d11, #1
    orr         r5,r0,r0, lsl#8
    add         r5,#0x002
    add         r5,#0x300
    vdup.u16    d27, r5                     @row value inc or reset accordingly
    ldr         r9, [r8]
    mov         r9,r9,lsl #1
    add         r9, r9, r0, lsl #1
@   sub     r9, r9, #1
    vdup.8      d26, r9
    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
    mov         r5,r2

@   sub     r4,r4,#8
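
@ The kernel below is software pipelined: rows 0-3 of the current 8x8
@ block are interpolated and stored through r2, while the multiplies for
@ rows 4-7 of the block started in the previous iteration are drained and
@ stored through r5, and the per-column indices and fractions for the
@ next block are recomputed on the fly (d30*d31 into q7, narrowed into
@ d10/d11).  The register roles match the prologue above; comments marked
@ "(from previous loop)" refer to the previous iteration's block.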

kernel_8_16_32:
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1

    vsub.s8     d8, d8, d26                 @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    add         r6, r1, r9
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vadd.s8     d9, d29, d8                 @ref_main_idx + 1

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    addgt       r8, r8, #4
    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 reference bytes needed (starting from the least idx)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    movle       r8, r12
    orr         r9,r0,r0, lsl#8
    mov         r9,r9,lsl #1
    add         r9,#0x002
    add         r9,#0x300
    vdup.u16    d27, r9                     @row value inc or reset accordingly

    ldrle       r14, col_for_intra_chroma_addr_3
ulbl6:
    addle       r14,r14,pc

    vadd.s8     d4, d29, d8                 @ref_main_idx (row 1)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vadd.s8     d5, d29, d9                 @ref_main_idx + 1 (row 1)

    vmov.i8     d29, #31                    @contains #31 for vand operation (fract mask)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d6

    vmov.i8     d29, #4                     @contains #4 for stepping the idx (2 rows * 2 bytes)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vadd.s8     d8, d29, d8                 @ref_main_idx (row 2)
    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vadd.s8     d9, d29, d9                 @ref_main_idx + 1 (row 2)

    movle       r11, r4,lsl #1
    ldr         r9, [r8]
    mov         r9,r9,lsl #1
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vadd.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vmull.u8    q11, d16, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vadd.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r5,r2,r3,lsl#2
    add         r9, r9, r0, lsl #1


    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vadd.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vadd.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vadd.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

@   sub     r9, r9, #1
    vqmovn.s16  d11, q7

    vadd.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vadd.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    vshl.u8     d11,#1

    vmull.u8    q11, d16, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vadd.s8     d8, d27, d11                @ref_main_idx (add row)
    vdup.8      d26, r9

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)


    add         r2,r3, lsl #2
    addgt       r2, r7, r2
    suble       r2, r2, r4, lsl #1
    addle       r2,r2,#8

    subs        r10, r10, #4                @subtract 4 and loop until all 8x8 blocks are done

    bne         kernel_8_16_32
epil_8_16_32:

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

end_func:
    add         sp, sp, #132
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
