@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_3_to_9.s
@*
@* @brief
@*  contains function definitions for chroma intra prediction filtering for
@*  angular modes 3 to 9. functions are coded using neon intrinsics and can
@*  be compiled using rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma intra prediction interpolation filter for angular modes 3 to 9
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (3 to 9)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode
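
@ the per-sample operation the neon code below implements is the standard
@ hevc two-tap angular interpolation; a rough c sketch (an illustrative
@ sketch with hypothetical names, not the library's reference code) for one
@ chroma component is:
@
@   pos   = (col + 1) * intra_pred_ang;
@   idx   = pos >> 5;
@   fract = pos & 31;
@   pred  = ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5;
@
@ since cb and cr are interleaved in pu1_ref, the neighbouring sample of the
@ same component is 2 bytes away, which is why the index registers below step
@ in units of 2; the vmull.u8/vmlal.u8 pair followed by vrshrn.i16 #5 performs
@ the weighted sum and rounded shift for eight interleaved samples at a time.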

.text
.align 4




.globl ihevc_intra_pred_chroma_mode_3_to_9_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_3_9

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8


idx_neg_idx_chroma_3_9_addr:
.long idx_neg_idx_chroma_3_9 - ulbl3 - 8

col_for_intra_chroma_addr_1:
.long col_for_intra_chroma - ulbl4 - 8

col_for_intra_chroma_addr_2:
.long col_for_intra_chroma - ulbl5 - 8

col_for_intra_chroma_addr_3:
.long col_for_intra_chroma - ulbl6 - 8

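@ note: the .long entries above hold pc-relative offsets rather than absolute
@ addresses; the "add rX, rX, pc" placed at the matching ulblN label resolves
@ each one at run time (pc reads as the current instruction address + 8 in arm
@ state, hence the "- 8"), keeping the table references position independent.
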
.type ihevc_intra_pred_chroma_mode_3_to_9_a9q, %function

ihevc_intra_pred_chroma_mode_3_to_9_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7,r7,pc

    ldr         r5,[sp,#44]                 @mode (3 to 9)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    ldr         r7, [r7]                    @intra_pred_ang
    vdup.8      d30, r7                     @intra_pred_ang

    ldr         r14, col_for_intra_chroma_addr_1
ulbl4:
    add         r14,r14,pc

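@ prologue: derive the per-column reference index and fractional weight from
@ (col + 1) * intra_pred_ang and produce the first 8x8 tile (8 rows of 4 u/v
@ pairs, or only 4 rows when nt == 4); kernel_8_16_32 then loops over the
@ remaining 8x8 tiles for nt = 8, 16 and 32.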
prologue_8_16_32:
    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #4)

    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #3
    ldr         r12, idx_neg_idx_chroma_3_9_addr @load most idx table
ulbl3:
    add         r12,r12,pc

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    mov         r9, r9, lsl #1
    add         r1, r0, r4, lsl #2          @pu1_ref + 4*nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @most idx added to final idx values
    sub         r1, r1, #26                 @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         r6, r1, r9

    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 values reqd based on indices values (from most idx)
    vshr.s16    q11, q11, #5

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vqmovn.s16  d8, q11
    vshl.s8     d8, d8, #1                  @ 2 * idx

    vand        d6, d6, d29                 @fract values in d6
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1

    movw        r0,#0x302                   @ idx value for v is +1 of u
    vdup.u16    d27,r0
    mov         r0,#0

    vmov.i8     d9, #22                     @row 0 to 7

    vsub.s8     d8, d8, d27                 @ref_main_idx (sub row)
    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
    vadd.s8     d8, d8, d9                  @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)
    vsub.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)

    vmov.i8     d29, #4

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    cmp         r4,#4
    beq         end_func
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #4                @decrement block counter by 4, go to end when all blocks are done

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8                @decrement the processed col
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4, lsl #1
    ldrle       r14, col_for_intra_chroma_addr_2
ulbl5:
    addle       r14,r14,pc
    addle       r0, r0, #8

    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vshl.s8     d11, d11, #1
    movw        r5, #0x302                  @idx value for v is +1 of u
    vdup.u16    d27, r5                     @row value inc or reset accordingly
    ldr         r9, [r8]                    @loads index value
    mov         r9, r9, lsl #1
    mov         r5, #22
    sub         r5, r5, r0, lsl #1
    vdup.8      d16, r5
    vdup.8      d26, r9

    mov         r5,r2
    vsub.s8     d11, d11, d27               @ref_main_idx (sub row)

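@ the kernel is software pipelined: the stores and round shifts marked
@ "(from previous loop)" retire rows 4-7 of the preceding 8x8 tile while the
@ table lookups and multiplies for the current tile are issued, hiding the
@ vtbl/vmull latencies.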
kernel_8_16_32:
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
    vsub.s8     d8, d26, d11                @ref_main_idx
    vmov        d26,d10

    subs        r11, r11, #8
    sub         r6, r1, r9
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx - 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    vsub.s8     d9, d8, d29                 @ref_main_idx - 2
    addgt       r8, r8, #4

    vld1.8      {d0,d1,d2,d3}, [r6]         @loads the 32 values reqd based on indices values (from most idx)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_chroma_addr_3
ulbl6:
    addle       r14,r14,pc
    vst1.8      d24, [r5], r3               @st (row 4)
    movle       r8, r12

    movw        r9,#0x302
    vdup.16     d27, r9                     @row value inc or reset accordingly
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)

    vsub.s8     d5, d9, d29                 @ref_main_idx - 1 (row 1)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
    vand        d6, d29, d26                @fract values in d6

    movle       r11, r4, lsl #1
    vmov.i8     d29, #4                     @contains #4 to step ref_main_idx for the next rows
    ldr         r9, [r8]

    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 2)

    mov         r9,r9,lsl #1
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 3)

    vmull.u8    q11, d10, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    add         r5,r2,r3,lsl#2
    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r9, r9, r0, lsl #1

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vtbl.8      d21, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vqmovn.s16  d11, q7

    vst1.8      d20, [r2], r3               @st (row 2)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
    vdup.8      d26, r9

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 7)

    mov         r6, #22                     @to compensate the 2*row value
    vshl.u8     d11,#1
    sub         r6, r6, r0, lsl #1

    vmull.u8    q11, d21, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2,r2,r3, lsl #2
    vdup.8      d16, r6
    addgt       r2, r7, r2

    suble       r2, r2, r4
    vsub.s8     d11, d11, d27               @ref_main_idx (add row)
    suble       r2,r2,#8

    subs        r10, r10, #4                @decrement block counter by 4, exit kernel loop when done

    bne         kernel_8_16_32

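@ epilogue: flush rows 4-7 of the final 8x8 tile that the pipelined kernel
@ still has in flight.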
epil_8_16_32:
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

end_func:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    498