@ (code-viewer breadcrumb removed: Home | History | Annotate | Download | only in arm)
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_filters_planar.s
     22 @*
@* @brief
@*  contains function definitions for intra prediction planar filtering.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
     28 @*
     29 @* @author
     30 @*  akshaya mukund
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
@* @brief
@*    chroma intra prediction filter for planar input
     45 @*
     46 @* @par description:
     47 @*
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] pi1_coeff
     61 @*  word8 pointer to the planar coefficients
     62 @*
@* @param[in] nt
@*  size of transform block
     65 @*
     66 @* @param[in] mode
     67 @*  type of filtering
     68 @*
     69 @* @returns
     70 @*
     71 @* @remarks
     72 @*  none
     73 @*
     74 @*******************************************************************************
     75 @*/
     76 
@void ihevc_intra_pred_chroma_planar(uword8* pu1_ref,
@                                    word32 src_strd,
@                                    uword8* pu1_dst,
@                                    word32 dst_strd,
@                                    word32 nt,
@                                    word32 mode,
@                                    word32 pi1_coeff)
     84 @**************variables vs registers*****************************************
     85 @r0 => *pu1_ref
     86 @r1 => src_strd
     87 @r2 => *pu1_dst
     88 @r3 => dst_strd
     89 
     90 @stack contents from #40
     91 @   nt
     92 @   mode
     93 @   pi1_coeff
     94 
     95 .text
     96 .align 4
     97 
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_chroma_planar_a9q
    102 .extern gau1_ihevc_planar_factor
    103 
    104 gau1_ihevc_planar_factor_addr:
    105 .long gau1_ihevc_planar_factor - ulbl1 - 8
    106 
    107 .type ihevc_intra_pred_chroma_planar_a9q, %function
    108 
    109 ihevc_intra_pred_chroma_planar_a9q:
    110 
    111     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    112 
    113     ldr         r4,[sp,#40]                 @loads nt
    114     ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
    115 ulbl1:
    116     add         r11,r11,pc
    117 
    118     clz         r5, r4
    119     rsb         r5, r5, #32
    120     vdup.16     q7, r5
    121     vneg.s16    q7, q7                      @shr value (so vneg)
    122     vdup.8      d2, r4                      @nt
    123     vdup.s16    q8, r4                      @nt
    124 
    125     sub         r6, r4, #1                  @nt-1
    126     add         r6, r0,r6,lsl #1            @2*(nt-1)
    127     ldr         r7, [r6]
    128     vdup.s16    d0, r7                      @src[nt-1]
    129 
    130     add         r6, r4, r4,lsl #1           @3nt
    131     add         r6, r6, #1                  @3nt + 1
    132     lsl         r6,r6,#1                    @2*(3nt + 1)
    133 
    134     add         r6, r6, r0
    135     ldr         r7, [r6]
    136     vdup.s16    d1, r7                      @src[3nt+1]
    137 
    138 
    139     add         r6, r4, r4                  @2nt
    140     add         r14, r6, #1                 @2nt+1
    141     lsl         r14,#1                      @2*(2nt+1)
    142     sub         r6, r6, #1                  @2nt-1
    143     lsl         r6,#1                       @2*(2nt-1)
    144     add         r6, r6, r0                  @&src[2nt-1]
    145     add         r14, r14, r0                @&src[2nt+1]
    146 
    147     mov         r8, #1                      @row+1 (row is first 0)
    148     sub         r9, r4, r8                  @nt-1-row (row is first 0)
    149 
    150     vdup.s8     d5, r8                      @row + 1
    151     vdup.s8     d6, r9                      @nt - 1 - row
    152     vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
    153 
    154     add         r12, r11, #1                @coeffs (to be reloaded after every row)
    155     mov         r1, r4                      @nt (row counter) (dec after every row)
    156     mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
    157     mov         r10, #8                     @increment for the coeffs
    158     mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)
    159 
    160     cmp         r4, #4
    161     beq         tf_sz_4
    162 
    163 
    164 
    165     mov         r10,r6
    166 tf_sz_8_16:
    167     vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
    168     vld1.s8     d8, [r12]!
    169     vmov        d9,d8
    170     vzip.8      d8,d9
    171     vsub.s8     d30, d2, d8                 @[nt-1-col]
    172     vsub.s8     d31, d2, d9
    173 
    174 
    175 
    176 
    177 loop_sz_8_16:
    178 
    179     ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
    180     vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    181     ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
    182     vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    183     vdup.s16    d4, r7                      @src[2nt-1-row]
    184     vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    185     vdup.s16    d3, r11                     @src[2nt-1-row]
    186     vmlal.u8    q6, d30, d4                 @(nt-1-col) *   src[2nt-1-row]
    187 
    188 
    189 
    190     vmull.u8    q14,d5,d0
    191     ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
    192     vmlal.u8    q14,d6,d11
    193     vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
    194 
    195 
    196     vmlal.u8    q14,d31,d4
    197     vsub.s8     d19, d6, d7                 @[nt-1-row]--
    198     vmlal.u8    q14,d9,d1
    199     vdup.s16    d4, r7                      @src[2nt-1-row]
    200 
    201     vmull.u8    q13, d18, d0                @(row+1)    *   src[nt-1]
    202     vadd.i16    q6, q6, q8                  @add (nt)
    203     vmlal.u8    q13, d19, d10               @(nt-1-row) *   src[2nt+1+col]
    204     vshl.s16    q6, q6, q7                  @shr
    205     vmlal.u8    q13, d8, d1                 @(col+1)    *   src[3nt+1]
    206     vadd.i16    q14,q14,q8
    207     vmlal.u8    q13, d30, d3                @(nt-1-col) *   src[2nt-1-row]
    208     vshl.s16    q14,q14,q7
    209 
    210 
    211 
    212 
    213 
    214     vmull.u8    q12,d18,d0
    215     vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
    216     vmlal.u8    q12,d19,d11
    217     vsub.s8     d6, d19, d7                 @[nt-1-row]--
    218     vmlal.u8    q12,d9,d1
    219     vmovn.i16   d12, q6
    220     vmlal.u8    q12,d31,d3
    221     vmovn.i16   d13,q14
    222 
    223 
    224 
    225 
    226     vadd.i16    q13, q13, q8                @add (nt)
    227     vmull.u8    q11, d5, d0                 @(row+1)    *   src[nt-1]
    228     vshl.s16    q13, q13, q7                @shr
    229     vmlal.u8    q11, d6, d10                @(nt-1-row) *   src[2nt+1+col]
    230     vst1.s32    {d12,d13}, [r2], r3
    231     vmlal.u8    q11, d8, d1                 @(col+1)    *   src[3nt+1]
    232     vadd.i16    q12,q12,q8
    233     vmlal.u8    q11, d30, d4                @(nt-1-col) *   src[2nt-1-row]
    234     vshl.s16    q12,q12,q7
    235 
    236     vmull.u8    q10,d5,d0
    237     vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
    238     vmlal.u8    q10,d6,d11
    239     vsub.s8     d19, d6, d7                 @[nt-1-row]--
    240     vmlal.u8    q10,d31,d4
    241 
    242     ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
    243     vmlal.u8    q10,d9,d1
    244     vdup.s16    d3, r11                     @src[2nt-1-row]
    245     vadd.i16    q11, q11, q8                @add (nt)
    246 
    247     vmull.u8    q6, d18, d0                 @(row+1)    *   src[nt-1]
    248     vmovn.i16   d26, q13
    249     vmlal.u8    q6, d19, d10                @(nt-1-row) *   src[2nt+1+col]
    250     vmovn.i16   d27,q12
    251 
    252     vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    253     vshl.s16    q11, q11, q7                @shr
    254 
    255     vmlal.u8    q6, d30, d3                 @(nt-1-col) *   src[2nt-1-row]
    256     vadd.i16    q10,q10,q8
    257 
    258     vmull.u8    q14,d18,d0
    259     vst1.s32    {d26,d27}, [r2], r3
    260 
    261     vmlal.u8    q14,d19,d11
    262     vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
    263 
    264     vsub.s8     d6, d19, d7                 @[nt-1-row]--
    265     vmlal.u8    q14,d9,d1
    266 
    267     vmlal.u8    q14,d31,d3
    268     vshl.s16    q10,q10,q7
    269 
    270 
    271     vadd.i16    q6, q6 ,q8                  @add (nt)
    272     vmovn.i16   d22, q11
    273 
    274 
    275     vadd.i16    q14,q14,q8
    276     vmovn.i16   d23,q10
    277 
    278 
    279     vshl.s16    q6, q6, q7                  @shr
    280     vst1.s32    {d22,d23}, [r2], r3
    281     vshl.s16    q14,q14,q7
    282 
    283 
    284 
    285 
    286 
    287     vmovn.i16   d20, q6
    288     vmovn.i16   d21,q14
    289 
    290     vst1.s32    {d20,d21}, [r2], r3
    291 
    292 
    293     subs        r1, r1, #4
    294 
    295     bne         loop_sz_8_16
    296 
    297 
    298 
    299 
    300     cmp         r4,#16
    301 
    302     bne         end_loop
    303 
    304 
    305     sub         r4,#16
    306     vdup.s8     d5, r8                      @row + 1
    307     vdup.s8     d6, r9                      @nt - 1 - row
    308     vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
    309 
    310     mov         r6,r10
    311     mov         r1,#16
    312     sub         r2,r2,r3,lsl #4
    313     add         r2,r2,#16
    314 
    315     vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
    316     vld1.s8     d8, [r12]!
    317     vmov        d9,d8
    318     vzip.8      d8,d9
    319     vsub.s8     d30, d2, d8                 @[nt-1-col]
    320     vsub.s8     d31, d2, d9
    321 
    322     beq         loop_sz_8_16
    323 
    324 
    325 
    326 tf_sz_4:
    327     vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    328     vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
    329     vmov        d9,d8
    330     vzip.8      d8,d9
    331 loop_sz_4:
    332     @mov        r10, #4             @reduce inc to #4 for 4x4
    333     ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
    334     vdup.s16    d4, r7                      @src[2nt-1-row]
    335 
    336     vsub.s8     d9, d2, d8                  @[nt-1-col]
    337 
    338     vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    339     vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    340     vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    341     vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
    342 @   vadd.i16    q6, q6, q8          @add (nt)
    343 @   vshl.s16    q6, q6, q7          @shr
    344 @   vmovn.i16   d12, q6
    345     vrshrn.s16  d12,q6,#3
    346 
    347     vst1.s32    {d12}, [r2], r3
    348 
    349     vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
    350     vsub.s8     d6, d6, d7                  @[nt-1-row]--
    351     subs        r1, r1, #1
    352 
    353     bne         loop_sz_4
    354 
    355 end_loop:
    356     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    357 
    358 
    359 
    360 
    361 
    362 
    363 
    364