@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_3_to_9.s
@*
@* @brief
@*  contains function definitions for chroma intra prediction angular
@* filtering for modes 3 to 9. functions are coded in neon assembly
@* and can be compiled using rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma intra prediction interpolation filter for angular modes 3 to 9
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
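@
@a minimal c sketch of the per-sample behaviour, inferred from the assembly
@below (illustrative only - not the upstream c model, and ref_main_idx() is
@a hypothetical helper standing in for the idx_neg_idx_chroma_3_9 based
@index computation):
@
@   for(col = 0; col < nt; col++)
@   {
@       pos   = (col + 1) * intra_pred_ang;   // gai4_ihevc_ang_table[mode]
@       idx   = pos >> 5;                     // whole-sample step
@       fract = pos & 31;                     // 5-bit fractional part
@       for(row = 0; row < nt; row++)
@       {
@           i = ref_main_idx(idx, row);       // even: u/v bytes are interleaved
@           pu1_dst[row * dst_strd + 2 * col] =
@               (pu1_ref[i]     * (32 - fract) + pu1_ref[i + 2] * fract + 16) >> 5;
@           pu1_dst[row * dst_strd + 2 * col + 1] =
@               (pu1_ref[i + 1] * (32 - fract) + pu1_ref[i + 3] * fract + 16) >> 5;
@       }
@   }
@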
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #104
@   nt
@   mode

.equ    nt_offset,          104
.equ    mode_offset,        108

.text
.align 4

.globl ihevc_intra_pred_chroma_mode_3_to_9_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_3_9

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_chroma_3_9_addr:
.long idx_neg_idx_chroma_3_9 - ulbl3 - 8

col_for_intra_chroma_addr_1:
.long col_for_intra_chroma - ulbl4 - 8

col_for_intra_chroma_addr_2:
.long col_for_intra_chroma - ulbl5 - 8

col_for_intra_chroma_addr_3:
.long col_for_intra_chroma - ulbl6 - 8

.type ihevc_intra_pred_chroma_mode_3_to_9_a9q, %function

ihevc_intra_pred_chroma_mode_3_to_9_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#mode_offset]        @mode (3 to 9)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    ldr         r7, [r7]                    @intra_pred_ang
    vdup.8      d30, r7                     @intra_pred_ang

    ldr         r14, col_for_intra_chroma_addr_1
ulbl4:
    add         r14,r14,pc

prologue_8_16_32:
    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #4)

    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #3
    ldr         r12, idx_neg_idx_chroma_3_9_addr @load most idx table
ulbl3:
    add         r12,r12,pc

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8 - 8*dst_strd

    ldr         r9, [r8]
    mov         r9, r9, lsl #1
    add         r1, r0, r4, lsl #2          @pu1_ref + 4*nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @most idx added to final idx values
    sub         r1, r1, #26                 @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row) for 8 & 8 - 1row

    sub         r6, r1, r9

    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 values reqd based on indices values (from most idx)
    vshr.s16    q11, q11, #5

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vqmovn.s16  d8, q11
    vshl.s8     d8, d8, #1                  @2 * idx

    vand        d6, d6, d29                 @fract values in d6
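@at this point, per column: pos = (col + 1) * intra_pred_ang, d6 holds
@fract = pos & 31, and d8 holds 2 * (pos >> 5), so the table index steps
@in interleaved u/v byte pairs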
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1

    movw        r0,#0x302                   @idx value for v is +1 of u
    vdup.u16    d27,r0
    mov         r0,#0

    vmov.i8     d9, #22                     @row 0 to 7

    vsub.s8     d8, d8, d27                 @ref_main_idx (sub row)
    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
    vadd.s8     d8, d8, d9                  @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract
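@each predicted u/v byte below is a 1/32-pel linear interpolation between
@two neighbouring reference pairs, pred = (ref * (32 - fract) +
@ref_plus_1 * fract + 16) >> 5, done as vmull/vmlal plus a rounding
@narrowing shift (vrshrn #5)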
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)
    vsub.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)

    vmov.i8     d29, #4

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    cmp         r4,#4
    beq         end_func
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #4                @one 8x8 strip done; exit if the whole block is processed

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8                @decrement the processed col
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4, lsl #1
    ldrle       r14, col_for_intra_chroma_addr_2
ulbl5:
    addle       r14,r14,pc
    addle       r0, r0, #8

    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vshl.s8     d11, d11, #1
    movw        r5, #0x302                  @idx value for v is +1 of u
    vdup.u16    d27, r5                     @row value inc or reset accordingly
    ldr         r9, [r8]                    @loads index value
    mov         r9, r9, lsl #1
    mov         r5, #22
    sub         r5, r5, r0, lsl #1
    vdup.8      d16, r5
    vdup.8      d26, r9

    mov         r5,r2
    vsub.s8     d11, d11, d27               @ref_main_idx (sub row)

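@the kernel below is software pipelined: the "(from previous loop)" stores
@and shifts drain rows 4-7 of the previous iteration while rows 0-4 of the
@current iteration are computed, hiding vtbl/vmull latency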
kernel_8_16_32:
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
    vsub.s8     d8, d26, d11                @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    sub         r6, r1, r9
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    vsub.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
    addgt       r8, r8, #4

    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 values reqd based on indices values (from most idx)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_chroma_addr_3
ulbl6:
    addle       r14,r14,pc
    vst1.8      d24, [r5], r3               @st (row 4)
    movle       r8, r12

    movw        r9,#0x302
    vdup.16     d27, r9                     @row value inc or reset accordingly
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)

    vsub.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vmov.i8     d29, #31                    @contains #31 to mask out the fract part

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d6

    movle       r11, r4, lsl #1
    vmov.i8     d29, #4                     @contains #4 to step the idx two rows at a time
    ldr         r9, [r8]

    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)

    mov         r9,r9,lsl #1
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vmull.u8    q11, d10, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    add         r5,r2,r3,lsl#2
    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r9, r9, r0, lsl #1

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vtbl.8      d21, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vqmovn.s16  d11, q7

    vst1.8      d20, [r2], r3               @st (row 2)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
    vdup.8      d26, r9

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    mov         r6, #22                     @to compensate the 2*row value
    vshl.u8     d11,#1
    sub         r6, r6, r0, lsl #1

    vmull.u8    q11, d21, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2,r2,r3, lsl #2
    vdup.8      d16, r6
    addgt       r2, r7, r2

    suble       r2, r2, r4
    vsub.s8     d11, d11, d27               @ref_main_idx (add row)
    suble       r2,r2,#8

    subs        r10, r10, #4                @one 8x8 strip done; fall through to the epilogue when finished

    bne         kernel_8_16_32

epil_8_16_32:
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)

    vst1.8      d20, [r5], r3               @st (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

end_func:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp