Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_filters_planar.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for intra prediction planar filtering.
     25 @* functions are coded using neon assembly and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  akshaya mukund
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*    luma intraprediction filter for planar input
     45 @*
     46 @* @par description:
     47 @*
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] pi1_coeff
     61 @*  word8 pointer to the planar coefficients
     62 @*
     63 @* @param[in] nt
     64 @*  size of transform block
     65 @*
     66 @* @param[in] mode
     67 @*  type of filtering
     68 @*
     69 @* @returns
     70 @*
     71 @* @remarks
     72 @*  none
     73 @*
     74 @*******************************************************************************
     75 @*/
     76 
     77 @void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
     78 @                                  word32 src_strd,
     79 @                                  uword8* pu1_dst,
     80 @                                  word32 dst_strd,
     81 @                                  word32 nt,
     82 @                                  word32 mode,
     83 @                  word32 pi1_coeff)
     84 @**************variables vs registers*****************************************
     85 @r0 => *pu1_ref
     86 @r1 => src_strd
     87 @r2 => *pu1_dst
     88 @r3 => dst_strd
     89 
     90 @stack contents from #104 (40 bytes of pushed core registers + 64 bytes of pushed d8-d15)
     91 @   nt
     92 @   mode
     93 @   pi1_coeff
        @
        @ What the routine computes (taken from the per-instruction comments below),
        @ for every (row, col) of the nt x nt destination block:
        @
        @   dst[row][col] = ( (row+1)    * src[nt-1]
        @                   + (col+1)    * src[3nt+1]
        @                   + (nt-1-row) * src[2nt+1+col]
        @                   + (nt-1-col) * src[2nt-1-row]
        @                   + nt ) >> (32 - clz(nt))
        @
        @ nt == 4 is handled one row at a time in loop_sz_4 (the "+ nt" and the
        @ shift are folded into a rounding narrow by 3). Every other size is
        @ processed in 8x8 tiles by a software-pipelined loop (prolog,
        @ kernel_plnr, epilog). In that loop the tag in parentheses, e.g. (3),
        @ names the row of the current tile an instruction belongs to, and (1n)
        @ marks work already done for the first row of the next tile.
        @
        @ src_strd, mode and pi1_coeff are never read by this code.
        @
        @ Register roles once set-up is complete:
        @   r0  = &src[2nt+1] (reload value for r14)   r1  = row counter
        @   r2  = dst write pointer                    r3  = dst_strd
        @   r4  = nt                                   r5  = pointer to gau1_ihevc_planar_factor_1 (8/16/32 path)
        @   r6  = pointer into src[2nt-1-row]          r7  = scratch / remaining work count
        @   r9  = 8 - 8*dst_strd                       r10 = nt - 8
        @   r11 = gau1_ihevc_planar_factor             r12 = gau1_ihevc_planar_factor + 1 + col
        @   r14 = &src[2nt+1+col]
        @   d0  = src[nt-1]   d1 = src[3nt+1]   d2 = nt   d5 = row+1
        @   d6  = nt-1-row    d7 = 1            q7 = -(32 - clz(nt))
        @
        @ Callee-saved state: r4-r12, lr and d8-d15 are saved on entry and
        @ restored on exit, as the AAPCS requires.
     94 
     95 .text
     96 .align 4
     97 
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_luma_planar_a9q
    102 .extern gau1_ihevc_planar_factor
    103 .extern gau1_ihevc_planar_factor_1
    104 
        @ Position-independent addresses of the two external tables. Each word
        @ holds (table - (ulblN + 8)); the instruction at ulblN adds pc to it.
        @ The "- 8" matches pc reading as "current instruction + 8", i.e. this
        @ assumes the code is assembled for ARM (A32) state.
    105 gau1_ihevc_planar_factor_addr:
    106 .long gau1_ihevc_planar_factor - ulbl1 - 8
    107 
    108 gau1_ihevc_planar_factor_1_addr:
    109 .long gau1_ihevc_planar_factor_1 - ulbl2 - 8
    110 
    111 
    112 .type ihevc_intra_pred_luma_planar_a9q, %function
    113 
    114 ihevc_intra_pred_luma_planar_a9q:
    115 
    116     stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr (40 bytes)
            vpush       {d8-d15}                    @save callee-saved NEON registers (64 bytes); d8-d15 are clobbered below
    117 
    118     ldr         r4,[sp,#104]                @loads nt (first stack argument, above the 40 + 64 saved bytes)
    119     ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
    120 ulbl1:
    121     add         r11,r11,pc
    122 
    123     clz         r5, r4
    124     rsb         r5, r5, #32
    125     vdup.16     q7, r5
    126     vneg.s16    q7, q7                      @shr value (so vneg)
    127     vdup.8      d2, r4                      @nt
    128     vdup.s16    q8, r4                      @nt
    129 
    130     sub         r6, r4, #1                  @nt-1
    131     add         r6, r6, r0
    132     ldr         r7, [r6]
    133     vdup.s8     d0, r7                      @src[nt-1]
    134 
    135     add         r6, r4, r4,lsl #1           @3nt
    136     add         r6, r6, #1                  @3nt + 1
    137     add         r6, r6, r0
    138     ldr         r7, [r6]
    139     vdup.s8     d1, r7                      @src[3nt+1]
    140 
    141     add         r6, r4, r4                  @2nt
    142     add         r14, r6, #1                 @2nt+1
    143     sub         r6, r6, #1                  @2nt-1
    144     add         r6, r6, r0                  @&src[2nt-1]
    145     add         r14, r14, r0                @&src[2nt+1]
    146 
    147     mov         r8, #1                      @row+1 (row is first 0)
    148     sub         r9, r4, r8                  @nt-1-row (row is first 0)
    149 
    150     vdup.s8     d5, r8                      @row + 1
    151     vdup.s8     d6, r9                      @nt - 1 - row
    152     vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
    153 
    154     add         r12, r11, #1                @coeffs (to be reloaded after every row)
    155     mov         r1, r4                      @nt (row counter) (dec after every row)
    156     mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
    157     mov         r10, #8                     @increment for the coeffs
    158     mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)
    159 
    160     cmp         r4, #4
    161     beq         tf_sz_4
    162 
    163 @@ ========== ***************** =====================
    164 prolog:
    165 tf_sz_8_16_32:
    166 
    167     mov         r7, r4                      @column counter (set to no of cols)
    168     mov         r9, r4, lsr #3              @divide nt by 8
    169     mul         r7, r7, r9                  @multiply width * height
    170     ldr         r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
    171 ulbl2:
    172     add         r5,r5,pc
    173     sub         r6, r6, #7
    174     mov         r8, r2
    175     lsl         r9, r3, #3                  @8*dst_strd
    176     rsb         r9, r9, #8                  @8 - 8*dst_strd (step from the end of one 8x8 tile to the top of the tile to its right)
    177     mov         r10, r4                     @nt
    178     sub         r10, r10, #8                @nt - 8
    179 
    180 col_loop_8_16_32:
    181 
    182     vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    183     vdup.16     q6, r4                      @(1)
    184     vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    185     vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]
    186 
    187 
    188     vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
    189 
    190     vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    191     vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
    192 
    193     vdup.s8     d20, d4[7]                  @(1)
    194     vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
    195 
    196     vdup.s8     d21, d4[6]                  @(2)
    197     vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
    198 
    199     vdup.16     q15, r4                     @(2)
    200     vadd.s8     d5, d5, d7                  @(1)
    201 
    202     vsub.s8     d6, d6, d7                  @(1)
    203 
    204     vdup.s8     d22, d4[5]                  @(3)
    205     vmlal.u8    q15, d5, d0                 @(2)
    206 
    207     vdup.16     q14, r4                     @(3)
    208     vmlal.u8    q15, d8, d1                 @(2)
    209 
    210     vmlal.u8    q15, d6, d3                 @(2)
    211     vmlal.u8    q15, d9, d21                @(2)
    212 
    213     vshl.s16    q6, q6, q7                  @(1)shr
    214 
    215     vadd.s8     d5, d5, d7                  @(2)
    216     vsub.s8     d6, d6, d7                  @(2)
    217 
    218     vmovn.i16   d12, q6                     @(1)
    219     vmlal.u8    q14, d5, d0                 @(3)
    220 
    221     vdup.8      d23, d4[4]                  @(4)
    222     vmlal.u8    q14, d8, d1                 @(3)
    223 
    224     vdup.16     q5, r4                      @(4)
    225     vmlal.u8    q14, d6, d3                 @(3)
    226 
    227     vst1.s8     d12, [r2], r3               @(1)str 8 values
    228     vmlal.u8    q14, d9, d22                @(3)
    229 
    230     vshl.s16    q15, q15, q7                @(2)shr
    231 
    232     vadd.s8     d5, d5, d7                  @(3)
    233     vsub.s8     d6, d6, d7                  @(3)
    234 
    235     vmovn.i16   d30, q15                    @(2)
    236     vmlal.u8    q5, d5, d0                  @(4)
    237 
    238     vdup.8      d20, d4[3]                  @(5)
    239     vmlal.u8    q5, d8, d1                  @(4)
    240 
    241     vdup.16     q8, r4                      @(5)
    242     vmlal.u8    q5, d6, d3                  @(4)
    243 
    244     vst1.s8     d30, [r2], r3               @(2)str 8 values
    245     vmlal.u8    q5, d9, d23                 @(4)
    246 
    247     vshl.s16    q14, q14, q7                @(3)shr
    248 
    249     vadd.s8     d5, d5, d7                  @(4)
    250     vsub.s8     d6, d6, d7                  @(4)
    251 
    252     vmovn.i16   d28, q14                    @(3)
    253     vmlal.u8    q8, d5, d0                  @(5)
    254 
    255     vdup.8      d21, d4[2]                  @(6)
    256     vmlal.u8    q8, d8, d1                  @(5)
    257 
    258     vdup.16     q9, r4                      @(6)
    259     vmlal.u8    q8, d6, d3                  @(5)
    260 
    261     vst1.s8     d28, [r2], r3               @(3)str 8 values
    262     vmlal.u8    q8, d9, d20                 @(5)
    263 
    264     vshl.s16    q5, q5, q7                  @(4)shr
    265     vadd.s8     d5, d5, d7                  @(5)
    266     vsub.s8     d6, d6, d7                  @(5)
    267 
    268     vmovn.i16   d10, q5                     @(4)
    269     vmlal.u8    q9, d5, d0                  @(6)
    270 
    271     vdup.8      d22, d4[1]                  @(7)
    272     vmlal.u8    q9, d8, d1                  @(6)
    273 
    274     vdup.16     q13, r4                     @(7)
    275     vmlal.u8    q9, d6, d3                  @(6)
    276 
    277     vst1.s8     d10, [r2], r3               @(4)str 8 values
    278     vmlal.u8    q9, d9, d21                 @(6)
    279 
    280     vshl.s16    q8, q8, q7                  @(5)shr
    281 
    282     vadd.s8     d5, d5, d7                  @(6)
    283     vsub.s8     d6, d6, d7                  @(6)
    284 
    285     vmovn.i16   d16, q8                     @(5)
    286     vmlal.u8    q13, d5, d0                 @(7)
    287 
    288     vdup.8      d23, d4[0]                  @(8)
    289     vmlal.u8    q13, d8, d1                 @(7)
    290 
    291     vdup.16     q12, r4                     @(8)
    292     vmlal.u8    q13, d6, d3                 @(7)
    293 
    294     vst1.s8     d16, [r2], r3               @(5)str 8 values
    295     vmlal.u8    q13, d9, d22                @(7)
    296 
    297     vshl.s16    q9, q9, q7                  @(6)shr
    298 
    299     vadd.s8     d5, d5, d7                  @(7)
    300     vsub.s8     d6, d6, d7                  @(7)
    301 
    302     vmovn.i16   d18, q9                     @(6)
    303     vmlal.u8    q12, d5, d0                 @(8)
    304 
    305 
    306     vmlal.u8    q12, d8, d1                 @(8)
    307 
    308     vmlal.u8    q12, d6, d3                 @(8)
    309 
    310     vst1.s8     d18, [r2], r3               @(6)str 8 values
    311     vmlal.u8    q12, d9, d23                @(8)
    312 
    313     vshl.s16    q13, q13, q7                @(7)shr
    314 
    315     subs        r7, r7, #8
    316 
    317     beq         epilog
    318 
    319     subs        r1, r1, #8                  @row counter
    320     addgt       r12, r12, #8                @col inc
    321     addgt       r14, r14, #8                @also for col inc
    322     movle       r1, r4                      @nt reloaded (refresh the value)
    323     addle       r12, r11, #1                @r12 reset
    324 
    325     movle       r14, r0                     @r14 reset
    326     vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    327 
    328     suble       r6, r6, #8                  @for next set of rows
    329     vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    330 
    331     addle       r5, r5, #8
    332     vdup.16     q6, r4                      @(1n)(1)
    333 
    334     vld1.s8     d5, [r5]
    335 
    336     vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    337     vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
    338 
    339     vdup.s8     d20, d4[7]                  @(1n)(1)
    340     vsub.s8     d6, d2, d5
    341 
    342     beq         epilog
    343 
    344 kernel_plnr:
    345 
    346     cmp         r1, #0                      @ (cond loop)
    347     vshl.s16    q12, q12, q7                @(8)shr
    348 
    349     vmovn.i16   d26, q13                    @(7)
    350     vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
    351 
    352     vmovn.i16   d24, q12                    @(8)
    353     vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
    354 
    355     vdup.s8     d21, d4[6]                  @(2)
    356     vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
    357 
    358     vdup.16     q15, r4                     @(2)
    359     vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
    360 
    361     vst1.s8     d26, [r2], r3               @(7)str 8 values
    362     vadd.s8     d5, d5, d7                  @(1)
    363 
    364     vst1.s8     d24, [r2], r3               @(8)str 8 values
    365     vsub.s8     d6, d6, d7                  @(1)
    366 
    367     addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    368     vmlal.u8    q15, d5, d0                 @(2)
    369 
    370     suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    371     vmlal.u8    q15, d8, d1                 @(2)
    372 
    373     vdup.s8     d22, d4[5]                  @(3)
    374     vmlal.u8    q15, d6, d3                 @(2)
    375 
    376     vdup.16     q14, r4                     @(3)
    377     vmlal.u8    q15, d9, d21                @(2)
    378 
    379     vshl.s16    q6, q6, q7                  @(1)shr
    380 
    381     vadd.s8     d5, d5, d7                  @(2)
    382     movle       r1, r4                      @nt reloaded (refresh the value)    (cond loop)
    383 
    384     vsub.s8     d6, d6, d7                  @(2)
    385     subs        r1, r1, #8                  @row counter (loop)
    386 
    387     vmovn.i16   d12, q6                     @(1)
    388     vmlal.u8    q14, d5, d0                 @(3)
    389 
    390     vdup.8      d23, d4[4]                  @(4)
    391     vmlal.u8    q14, d8, d1                 @(3)
    392 
    393     vdup.16     q5, r4                      @(4)
    394     vmlal.u8    q14, d6, d3                 @(3)
    395 
    396     vst1.s8     d12, [r2], r3               @(1)str 8 values
    397     vmlal.u8    q14, d9, d22                @(3)
    398 
    399     vshl.s16    q15, q15, q7                @(2)shr
    400 
    401     vadd.s8     d5, d5, d7                  @(3)
    402 
    403     vsub.s8     d6, d6, d7                  @(3)
    404 
    405     vmovn.i16   d30, q15                    @(2)
    406     vmlal.u8    q5, d5, d0                  @(4)
    407 
    408     vdup.8      d20, d4[3]                  @(5)
    409     vmlal.u8    q5, d8, d1                  @(4)
    410 
    411     vdup.16     q8, r4                      @(5)
    412     vmlal.u8    q5, d6, d3                  @(4)
    413 
    414     vst1.s8     d30, [r2], r3               @(2)str 8 values
    415     vmlal.u8    q5, d9, d23                 @(4)
    416 
    417     vshl.s16    q14, q14, q7                @(3)shr
    418 
    419     vadd.s8     d5, d5, d7                  @(4)
    420 
    421     vsub.s8     d6, d6, d7                  @(4)
    422 
    423     vmovn.i16   d28, q14                    @(3)
    424     vmlal.u8    q8, d5, d0                  @(5)
    425 
    426     vdup.8      d21, d4[2]                  @(6)
    427     vmlal.u8    q8, d8, d1                  @(5)
    428 
    429     vdup.16     q9, r4                      @(6)
    430     vmlal.u8    q8, d6, d3                  @(5)
    431 
    432     vst1.s8     d28, [r2], r3               @(3)str 8 values
    433     vmlal.u8    q8, d9, d20                 @(5)
    434 
    435     addle       r12, r11, #1                @r12 reset (cond loop)
    436     vshl.s16    q5, q5, q7                  @(4)shr
    437 
    438     addgt       r12, r12, #8                @col inc (cond loop)
    439     vadd.s8     d5, d5, d7                  @(5)
    440 
    441     addgt       r14, r14, #8                @also for col inc (cond loop)
    442     vsub.s8     d6, d6, d7                  @(5)
    443 
    444     vmovn.i16   d10, q5                     @(4)
    445     vmlal.u8    q9, d5, d0                  @(6)
    446 
    447     vdup.8      d22, d4[1]                  @(7)
    448     vmlal.u8    q9, d8, d1                  @(6)
    449 
    450     vdup.16     q13, r4                     @(7)
    451     vmlal.u8    q9, d6, d3                  @(6)
    452 
    453     vst1.s8     d10, [r2], r3               @(4)str 8 values
    454     vmlal.u8    q9, d9, d21                 @(6)
    455 
    456     movle       r14, r0                     @r14 reset (cond loop)
    457     vshl.s16    q8, q8, q7                  @(5)shr
    458 
    459     suble       r6, r6, #8                  @for next set of rows (cond loop)
    460     vadd.s8     d5, d5, d7                  @(6)
    461 
    462     addle       r5, r5, #8                  @ (cond loop)
    463     vsub.s8     d6, d6, d7                  @(6)
    464 
    465     vmovn.i16   d16, q8                     @(5)
    466     vmlal.u8    q13, d5, d0                 @(7)
    467 
    468     vdup.8      d23, d4[0]                  @(8)
    469     vmlal.u8    q13, d8, d1                 @(7)
    470 
    471     vdup.16     q12, r4                     @(8)
    472     vmlal.u8    q13, d6, d3                 @(7)
    473 
    474     vst1.s8     d16, [r2], r3               @(5)str 8 values
    475     vmlal.u8    q13, d9, d22                @(7)
    476 
    477     vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    478     vshl.s16    q9, q9, q7                  @(6)shr
    479 
    480     vadd.s8     d5, d5, d7                  @(7)
    481 
    482     vsub.s8     d6, d6, d7                  @(7)
    483 
    484     vmovn.i16   d18, q9                     @(6)
    485     vmlal.u8    q12, d5, d0                 @(8)
    486 
    487     vld1.s8     d5, [r5]                    @(row+1 value)
    488     vmlal.u8    q12, d8, d1                 @(8)
    489 
    490     vdup.s8     d20, d4[7]                  @(1n)(1)
    491     vmlal.u8    q12, d6, d3                 @(8)
    492 
    493     vst1.s8     d18, [r2], r3               @(6)str 8 values
    494     vmlal.u8    q12, d9, d23                @(8)
    495 
    496     vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    497     vsub.s8     d6, d2, d5                  @(nt-1-row) value
    498 
    499     subs        r7, r7, #8                  @col counter
    500 
    501     vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    502     vshl.s16    q13, q13, q7                @(7)shr
    503 
    504     vdup.16     q6, r4                      @(1n)(1)
    505     vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
    506 
    507     bne         kernel_plnr
    508 
    509 epilog:
    510 
    511     vmovn.i16   d26, q13                    @(7)
    512     vst1.s8     d26, [r2], r3               @(7)str 8 values
    513 
    514     vshl.s16    q12, q12, q7                @(8)shr
    515     vmovn.i16   d24, q12                    @(8)
    516     vst1.s8     d24, [r2], r3               @(8)str 8 values
    517 
    518 @@ ========== ***************** =====================
    519 
            @ Z is set on every path that reaches epilog (both "beq epilog"
            @ branches and the fall-through of "bne kernel_plnr"), and the NEON
            @ instructions above leave the flags alone, so this always skips
            @ the 4x4 path.
    520     beq         end_loop
    521 
    522 tf_sz_4:
    523     vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    524     vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
    525 loop_sz_4:
    526     mov         r10, #4                     @reduce inc to #4 for 4x4
    527     ldr         r7, [r6], #-1               @src[2nt-1-row] (dec to take into account row)
    528     vdup.s8     d4, r7                      @src[2nt-1-row]
    529 
    530     vsub.s8     d9, d2, d8                  @[nt-1-col]
    531 
    532     vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    533     vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    534     vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    535     vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
    536 @   vadd.i16    q6, q6, q8          @add (nt)
    537 @   vshl.s16    q6, q6, q7          @shr
    538 @   vmovn.i16   d12, q6
    539     vrshrn.s16  d12,q6,#3
    540     vst1.s32    {d12[0]}, [r2], r3
    541 
    542     vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
    543     vsub.s8     d6, d6, d7                  @[nt-1-row]--
    544     subs        r1, r1, #1
    545 
    546     bne         loop_sz_4
    547 
    548 end_loop:
            vpop        {d8-d15}                    @restore callee-saved NEON registers
    549     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (saved lr is loaded into pc)
    550 
    551 
    552 
    553 
    554 
    555 
    556 
    557 
    558