Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_filters_planar.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for planar intra prediction.
     25 @*  functions are coded in neon assembly and can be compiled using
     26 @*  rvct
     27 @*
     28 @*
     29 @* @author
     30 @*  akshaya mukund
     31 @*
     32 @* @par list of functions:
     33 @*  - ihevc_intra_pred_luma_planar_a9q()
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*    luma intraprediction filter for planar input
     45 @*
     46 @* @par description:
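     47 @*    dst[row][col] = ((nt-1-col) * src[2nt-1-row] + (col+1) * src[3nt+1]
        @*                   + (nt-1-row) * src[2nt+1+col] + (row+1) * src[nt-1]
        @*                   + nt) >> (log2(nt) + 1)
        @*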
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] pi1_coeff
     61 @*  word8 pointer to the planar coefficients
     62 @*
     63 @* @param[in] nt
     64 @*  size of transform block
     65 @*
     66 @* @param[in] mode
     67 @*  type of filtering
     68 @*
     69 @* @returns
     70 @*
     71 @* @remarks
     72 @*  none
     73 @*
     74 @*******************************************************************************
     75 @*/
     76 
     77 @void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
     78 @                                  word32 src_strd,
     79 @                                  uword8* pu1_dst,
     80 @                                  word32 dst_strd,
     81 @                                  word32 nt,
     82 @                                  word32 mode,
     83 @                  word32 pi1_coeff)
     84 @**************variables vs registers*****************************************
     85 @r0 => *pu1_ref
     86 @r1 => src_strd
     87 @r2 => *pu1_dst
     88 @r3 => dst_strd
     89 
     90 @stack contents from sp + #104 (after pushing r4-r12, r14 and d8-d15: 40 + 64 bytes)
     91 @   nt
     92 @   mode
     93 @   pi1_coeff
     94 
     95 .equ    nt_offset,      104
     96 
     97 .text
     98 .align 4
     99 
    100 
    101 
    102 
    103 .globl ihevc_intra_pred_luma_planar_a9q
    104 .extern gau1_ihevc_planar_factor
    105 .extern gau1_ihevc_planar_factor_1
    106 
    107 gau1_ihevc_planar_factor_addr:              @pc-relative offset of gau1_ihevc_planar_factor; pc is added at ulbl1
    108 .long gau1_ihevc_planar_factor - ulbl1 - 8  @-8 compensates for pc reading 8 bytes ahead of ulbl1 (assumes arm state)
    109 
    110 gau1_ihevc_planar_factor_1_addr:            @pc-relative offset of gau1_ihevc_planar_factor_1; pc is added at ulbl2
    111 .long gau1_ihevc_planar_factor_1 - ulbl2 - 8 @same -8 pc read-ahead compensation, relative to ulbl2
    112 
    113 
    114 .type ihevc_intra_pred_luma_planar_a9q, %function
    115 
    116 ihevc_intra_pred_luma_planar_a9q:
        @ hevc planar intra prediction: entry and common setup.
        @ in : r0 = pu1_ref, r2 = pu1_dst, r3 = dst_strd, nt at [sp, #nt_offset] after the pushes below
        @      (r1 = src_strd is overwritten with nt and never used as a stride)
        @ set up here: d0 = src[nt-1], d1 = src[3nt+1], d2 = nt, d5 = row+1, d6 = nt-1-row, d7 = 1,
        @      q7 = -(32 - clz(nt)) (right shift amount for vshl), r6 = &src[2nt-1],
        @      r0 = r14 = &src[2nt+1], r12 = gau1_ihevc_planar_factor + 1, r1 = nt, r5 = dst, r10 = 8
        @ nt == 4 branches to tf_sz_4; any other nt falls through to the 8x8 tile code
    117 
    118     stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr (40 bytes)
    119     vpush       {d8 - d15}                  @save callee-saved neon registers (64 bytes)
    120     ldr         r4,[sp,#nt_offset]          @loads nt
    121     ldr         r11, gau1_ihevc_planar_factor_addr @loads pc-relative offset of the coeff table
    122 ulbl1:
    123     add         r11,r11,pc                  @r11 = address of gau1_ihevc_planar_factor
    124 
    125     clz         r5, r4
    126     rsb         r5, r5, #32                 @32 - clz(nt), i.e. log2(nt) + 1 for a power-of-two nt
    127     vdup.16     q7, r5
    128     vneg.s16    q7, q7                      @shr value (so vneg)
    129     vdup.8      d2, r4                      @nt
    130     vdup.s16    q8, r4                      @nt
    131 
    132     sub         r6, r4, #1                  @nt-1
    133     add         r6, r6, r0                  @&src[nt-1]
    134     ldrb        r7, [r6]                    @byte load: vdup.s8 only uses the low byte, no unaligned word access
    135     vdup.s8     d0, r7                      @src[nt-1]
    136 
    137     add         r6, r4, r4,lsl #1           @3nt
    138     add         r6, r6, #1                  @3nt + 1
    139     add         r6, r6, r0                  @&src[3nt+1]
    140     ldrb        r7, [r6]                    @byte load, same reason as above
    141     vdup.s8     d1, r7                      @src[3nt+1]
    142 
    143     add         r6, r4, r4                  @2nt
    144     add         r14, r6, #1                 @2nt+1
    145     sub         r6, r6, #1                  @2nt-1
    146     add         r6, r6, r0                  @&src[2nt-1]
    147     add         r14, r14, r0                @&src[2nt+1]
    148 
    149     mov         r8, #1                      @row+1 (row is first 0)
    150     sub         r9, r4, r8                  @nt-1-row (row is first 0)
    151 
    152     vdup.s8     d5, r8                      @row + 1
    153     vdup.s8     d6, r9                      @nt - 1 - row
    154     vmov        d7, d5                      @d7 = 1 in every lane: used to inc row+1 and dec nt-1-row
    155 
    156     add         r12, r11, #1                @coeffs, starting at table entry 1 (reset to this value per band of rows)
    157     mov         r1, r4                      @nt (counter, dec by 1 per row in the 4x4 path and by 8 per tile otherwise)
    158     mov         r5, r2                      @dst (replaced by the planar_factor_1 pointer in the 8x8 tile path)
    159     mov         r10, #8                     @increment for the coeffs
    160     mov         r0, r14                     @&src[2nt+1] kept in r0 so r14 can be reset from it
    161 
    162     cmp         r4, #4
    163     beq         tf_sz_4
    164 
    165 @@ ========== ***************** =====================
    166 prolog:
    167 tf_sz_8_16_32:
    168 @nt != 4: output is produced in 8x8 tiles; this prolog computes the first tile and pre-loads the next one
    169     mov         r7, r4                      @r7 = nt
    170     mov         r9, r4, lsr #3              @divide nt by 8
    171     mul         r7, r7, r9                  @r7 = nt * (nt/8): tile counter, decremented by 8 per 8x8 tile
    172     ldr         r5, gau1_ihevc_planar_factor_1_addr @loads pc-relative offset of gau1_ihevc_planar_factor_1
    173 ulbl2:
    174     add         r5,r5,pc                    @r5 = pointer from which the (row+1) vectors of later tiles are loaded
    175     sub         r6, r6, #7                  @&src[2nt-8]: lane 7 of an 8 byte load is then src[2nt-1] (row 0)
    176     mov         r8, r2                      @copy of dst
    177     lsl         r9, r3, #3                  @8*stride
    178     rsb         r9, r9, #8                  @8-8*stride (added to dst to reach the tile to the right)
    179     mov         r10, r4                     @nt
    180     sub         r10, r10, #8                @nt - 8 (subtracted from dst to return to column 0)
    181 
    182 col_loop_8_16_32:
    183 @(n) tags the tile row (1..8) an instruction works on; (1n) tags loads for row 1 of the next tile
    184     vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    185     vdup.16     q6, r4                      @(1)accumulator starts at nt (rounding term)
    186     vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    187     vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]
    188 
    189 
    190     vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
    191 
    192     vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    193     vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
    194 
    195     vdup.s8     d20, d4[7]                  @(1)broadcast src[2nt-1-row] for this row
    196     vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
    197 
    198     vdup.s8     d21, d4[6]                  @(2)
    199     vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
    200 
    201     vdup.16     q15, r4                     @(2)
    202     vadd.s8     d5, d5, d7                  @(1)row+1 advanced for the next row
    203 
    204     vsub.s8     d6, d6, d7                  @(1)nt-1-row advanced for the next row
    205 
    206     vdup.s8     d22, d4[5]                  @(3)
    207     vmlal.u8    q15, d5, d0                 @(2)
    208 
    209     vdup.16     q14, r4                     @(3)
    210     vmlal.u8    q15, d8, d1                 @(2)
    211 
    212     vmlal.u8    q15, d6, d3                 @(2)
    213     vmlal.u8    q15, d9, d21                @(2)
    214 
    215     vshl.s16    q6, q6, q7                  @(1)shr (q7 is negative, so this shifts right)
    216 
    217     vadd.s8     d5, d5, d7                  @(2)
    218     vsub.s8     d6, d6, d7                  @(2)
    219 
    220     vmovn.i16   d12, q6                     @(1)narrow to 8 bit
    221     vmlal.u8    q14, d5, d0                 @(3)
    222 
    223     vdup.8      d23, d4[4]                  @(4)
    224     vmlal.u8    q14, d8, d1                 @(3)
    225 
    226     vdup.16     q5, r4                      @(4)
    227     vmlal.u8    q14, d6, d3                 @(3)
    228 
    229     vst1.s8     d12, [r2], r3               @(1)str 8 values
    230     vmlal.u8    q14, d9, d22                @(3)
    231 
    232     vshl.s16    q15, q15, q7                @(2)shr
    233 
    234     vadd.s8     d5, d5, d7                  @(3)
    235     vsub.s8     d6, d6, d7                  @(3)
    236 
    237     vmovn.i16   d30, q15                    @(2)
    238     vmlal.u8    q5, d5, d0                  @(4)
    239 
    240     vdup.8      d20, d4[3]                  @(5)
    241     vmlal.u8    q5, d8, d1                  @(4)
    242 
    243     vdup.16     q8, r4                      @(5)
    244     vmlal.u8    q5, d6, d3                  @(4)
    245 
    246     vst1.s8     d30, [r2], r3               @(2)str 8 values
    247     vmlal.u8    q5, d9, d23                 @(4)
    248 
    249     vshl.s16    q14, q14, q7                @(3)shr
    250 
    251     vadd.s8     d5, d5, d7                  @(4)
    252     vsub.s8     d6, d6, d7                  @(4)
    253 
    254     vmovn.i16   d28, q14                    @(3)
    255     vmlal.u8    q8, d5, d0                  @(5)
    256 
    257     vdup.8      d21, d4[2]                  @(6)
    258     vmlal.u8    q8, d8, d1                  @(5)
    259 
    260     vdup.16     q9, r4                      @(6)
    261     vmlal.u8    q8, d6, d3                  @(5)
    262 
    263     vst1.s8     d28, [r2], r3               @(3)str 8 values
    264     vmlal.u8    q8, d9, d20                 @(5)
    265 
    266     vshl.s16    q5, q5, q7                  @(4)shr
    267     vadd.s8     d5, d5, d7                  @(5)
    268     vsub.s8     d6, d6, d7                  @(5)
    269 
    270     vmovn.i16   d10, q5                     @(4)
    271     vmlal.u8    q9, d5, d0                  @(6)
    272 
    273     vdup.8      d22, d4[1]                  @(7)
    274     vmlal.u8    q9, d8, d1                  @(6)
    275 
    276     vdup.16     q13, r4                     @(7)
    277     vmlal.u8    q9, d6, d3                  @(6)
    278 
    279     vst1.s8     d10, [r2], r3               @(4)str 8 values
    280     vmlal.u8    q9, d9, d21                 @(6)
    281 
    282     vshl.s16    q8, q8, q7                  @(5)shr
    283 
    284     vadd.s8     d5, d5, d7                  @(6)
    285     vsub.s8     d6, d6, d7                  @(6)
    286 
    287     vmovn.i16   d16, q8                     @(5)
    288     vmlal.u8    q13, d5, d0                 @(7)
    289 
    290     vdup.8      d23, d4[0]                  @(8)
    291     vmlal.u8    q13, d8, d1                 @(7)
    292 
    293     vdup.16     q12, r4                     @(8)
    294     vmlal.u8    q13, d6, d3                 @(7)
    295 
    296     vst1.s8     d16, [r2], r3               @(5)str 8 values
    297     vmlal.u8    q13, d9, d22                @(7)
    298 
    299     vshl.s16    q9, q9, q7                  @(6)shr
    300 
    301     vadd.s8     d5, d5, d7                  @(7)
    302     vsub.s8     d6, d6, d7                  @(7)
    303 
    304     vmovn.i16   d18, q9                     @(6)
    305     vmlal.u8    q12, d5, d0                 @(8)
    306 
    307 
    308     vmlal.u8    q12, d8, d1                 @(8)
    309 
    310     vmlal.u8    q12, d6, d3                 @(8)
    311 
    312     vst1.s8     d18, [r2], r3               @(6)str 8 values
    313     vmlal.u8    q12, d9, d23                @(8)
    314 
    315     vshl.s16    q13, q13, q7                @(7)shr (rows 7 and 8 are narrowed and stored in kernel_plnr or epilog)
    316 
    317     subs        r7, r7, #8                  @tile counter; zero means this was the only tile
    318 
    319     beq         epilog
    320 
    321     subs        r1, r1, #8                  @counter for the current band of 8 rows; gt = more tiles to the right
    322     addgt       r12, r12, #8                @col inc
    323     addgt       r14, r14, #8                @also for col inc
    324     movle       r1, r4                      @nt reloaded (refresh the value)
    325     addle       r12, r11, #1                @r12 reset
    326 
    327     movle       r14, r0                     @r14 reset
    328     vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    329 
    330     suble       r6, r6, #8                  @for next set of rows
    331     vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    332 
    333     addle       r5, r5, #8                  @next 8 bytes of the (row+1) table for the next set of rows
    334     vdup.16     q6, r4                      @(1n)(1)
    335 
    336     vld1.s8     d5, [r5]                    @(1n)(row+1) vector for the next tile (presumably 8 equal bytes - verify against the table)
    337 
    338     vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    339     vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
    340 
    341     vdup.s8     d20, d4[7]                  @(1n)(1)
    342     vsub.s8     d6, d2, d5                  @(1n)nt - (row+1) = nt-1-row
    343 
    344     beq         epilog                      @tests the flags left by the subs on r1 above (neon ops do not change them)
    345 
    346 kernel_plnr:
    347 @steady state: finishes rows (7),(8) of the previous tile, computes rows (1)-(8) of the current one, pre-loads the next
    348     cmp         r1, #0                      @ (cond loop) gt = current tile is to the right of the previous one
    349     vshl.s16    q12, q12, q7                @(8)shr
    350 
    351     vmovn.i16   d26, q13                    @(7)
    352     vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
    353 
    354     vmovn.i16   d24, q12                    @(8)
    355     vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
    356 
    357     vdup.s8     d21, d4[6]                  @(2)
    358     vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
    359 
    360     vdup.16     q15, r4                     @(2)
    361     vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
    362 
    363     vst1.s8     d26, [r2], r3               @(7)str 8 values
    364     vadd.s8     d5, d5, d7                  @(1)
    365 
    366     vst1.s8     d24, [r2], r3               @(8)str 8 values
    367     vsub.s8     d6, d6, d7                  @(1)
    368 
    369     addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    370     vmlal.u8    q15, d5, d0                 @(2)
    371 
    372     suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    373     vmlal.u8    q15, d8, d1                 @(2)
    374 
    375     vdup.s8     d22, d4[5]                  @(3)
    376     vmlal.u8    q15, d6, d3                 @(2)
    377 
    378     vdup.16     q14, r4                     @(3)
    379     vmlal.u8    q15, d9, d21                @(2)
    380 
    381     vshl.s16    q6, q6, q7                  @(1)shr
    382 
    383     vadd.s8     d5, d5, d7                  @(2)
    384     movle       r1, r4                      @nt reloaded (refresh the value)    (cond loop)
    385 
    386     vsub.s8     d6, d6, d7                  @(2)
    387     subs        r1, r1, #8                  @counter for the band of 8 rows (loop); sets the flags for the le/gt pointer updates below
    388 
    389     vmovn.i16   d12, q6                     @(1)
    390     vmlal.u8    q14, d5, d0                 @(3)
    391 
    392     vdup.8      d23, d4[4]                  @(4)
    393     vmlal.u8    q14, d8, d1                 @(3)
    394 
    395     vdup.16     q5, r4                      @(4)
    396     vmlal.u8    q14, d6, d3                 @(3)
    397 
    398     vst1.s8     d12, [r2], r3               @(1)str 8 values
    399     vmlal.u8    q14, d9, d22                @(3)
    400 
    401     vshl.s16    q15, q15, q7                @(2)shr
    402 
    403     vadd.s8     d5, d5, d7                  @(3)
    404 
    405     vsub.s8     d6, d6, d7                  @(3)
    406 
    407     vmovn.i16   d30, q15                    @(2)
    408     vmlal.u8    q5, d5, d0                  @(4)
    409 
    410     vdup.8      d20, d4[3]                  @(5)
    411     vmlal.u8    q5, d8, d1                  @(4)
    412 
    413     vdup.16     q8, r4                      @(5)
    414     vmlal.u8    q5, d6, d3                  @(4)
    415 
    416     vst1.s8     d30, [r2], r3               @(2)str 8 values
    417     vmlal.u8    q5, d9, d23                 @(4)
    418 
    419     vshl.s16    q14, q14, q7                @(3)shr
    420 
    421     vadd.s8     d5, d5, d7                  @(4)
    422 
    423     vsub.s8     d6, d6, d7                  @(4)
    424 
    425     vmovn.i16   d28, q14                    @(3)
    426     vmlal.u8    q8, d5, d0                  @(5)
    427 
    428     vdup.8      d21, d4[2]                  @(6)
    429     vmlal.u8    q8, d8, d1                  @(5)
    430 
    431     vdup.16     q9, r4                      @(6)
    432     vmlal.u8    q8, d6, d3                  @(5)
    433 
    434     vst1.s8     d28, [r2], r3               @(3)str 8 values
    435     vmlal.u8    q8, d9, d20                 @(5)
    436 
    437     addle       r12, r11, #1                @r12 reset (cond loop)
    438     vshl.s16    q5, q5, q7                  @(4)shr
    439 
    440     addgt       r12, r12, #8                @col inc (cond loop)
    441     vadd.s8     d5, d5, d7                  @(5)
    442 
    443     addgt       r14, r14, #8                @also for col inc (cond loop)
    444     vsub.s8     d6, d6, d7                  @(5)
    445 
    446     vmovn.i16   d10, q5                     @(4)
    447     vmlal.u8    q9, d5, d0                  @(6)
    448 
    449     vdup.8      d22, d4[1]                  @(7)
    450     vmlal.u8    q9, d8, d1                  @(6)
    451 
    452     vdup.16     q13, r4                     @(7)
    453     vmlal.u8    q9, d6, d3                  @(6)
    454 
    455     vst1.s8     d10, [r2], r3               @(4)str 8 values
    456     vmlal.u8    q9, d9, d21                 @(6)
    457 
    458     movle       r14, r0                     @r14 reset (cond loop)
    459     vshl.s16    q8, q8, q7                  @(5)shr
    460 
    461     suble       r6, r6, #8                  @for next set of rows (cond loop)
    462     vadd.s8     d5, d5, d7                  @(6)
    463 
    464     addle       r5, r5, #8                  @ (cond loop) next 8 bytes of the (row+1) table
    465     vsub.s8     d6, d6, d7                  @(6)
    466 
    467     vmovn.i16   d16, q8                     @(5)
    468     vmlal.u8    q13, d5, d0                 @(7)
    469 
    470     vdup.8      d23, d4[0]                  @(8)last use of the old d4 before it is reloaded below
    471     vmlal.u8    q13, d8, d1                 @(7)
    472 
    473     vdup.16     q12, r4                     @(8)
    474     vmlal.u8    q13, d6, d3                 @(7)
    475 
    476     vst1.s8     d16, [r2], r3               @(5)str 8 values
    477     vmlal.u8    q13, d9, d22                @(7)
    478 
    479     vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    480     vshl.s16    q9, q9, q7                  @(6)shr
    481 
    482     vadd.s8     d5, d5, d7                  @(7)
    483 
    484     vsub.s8     d6, d6, d7                  @(7)
    485 
    486     vmovn.i16   d18, q9                     @(6)
    487     vmlal.u8    q12, d5, d0                 @(8)
    488 
    489     vld1.s8     d5, [r5]                    @(1n)(row+1 value) for the next tile
    490     vmlal.u8    q12, d8, d1                 @(8)
    491 
    492     vdup.s8     d20, d4[7]                  @(1n)(1)
    493     vmlal.u8    q12, d6, d3                 @(8)
    494 
    495     vst1.s8     d18, [r2], r3               @(6)str 8 values
    496     vmlal.u8    q12, d9, d23                @(8)
    497 
    498     vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    499     vsub.s8     d6, d2, d5                  @(1n)(nt-1-row) value
    500 
    501     subs        r7, r7, #8                  @tile counter; flags are consumed by the bne below
    502 
    503     vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    504     vshl.s16    q13, q13, q7                @(7)shr
    505 
    506     vdup.16     q6, r4                      @(1n)(1)
    507     vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
    508 
    509     bne         kernel_plnr
    510 
    511 epilog:
        @ drains the pipeline: narrows and stores rows (7) and (8) of the last 8x8 tile
    512 
    513     vmovn.i16   d26, q13                    @(7)narrow to 8 bit (q13 was already shifted before arriving here)
    514     vst1.s8     d26, [r2], r3               @(7)str 8 values
    515 
    516     vshl.s16    q12, q12, q7                @(8)shr
    517     vmovn.i16   d24, q12                    @(8)
    518     vst1.s8     d24, [r2], r3               @(8)str 8 values
    519 
    520 @@ ========== ***************** =====================
    521 
    522     b           end_loop                    @unconditional: the 8x8 tile path must never fall through into the 4x4 path
    523 
    524 tf_sz_4:
        @ 4x4 path: one row (4 pixels) per pass through loop_sz_4, r1 = rows left
    525     vld1.s8     d10, [r14]                  @load src[2nt+1+col] (8 bytes read, only 4 results are stored)
    526     vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
    527 loop_sz_4:
    529     ldrb        r7, [r6], #-1               @src[2nt-1-row] as a byte load (dec to take into account row)
    530     vdup.s8     d4, r7                      @src[2nt-1-row]
    531 
    532     vsub.s8     d9, d2, d8                  @[nt-1-col]
    533 
    534     vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    535     vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    536     vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    537     vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
    538 @   vadd.i16    q6, q6, q8          @add (nt)
    539 @   vshl.s16    q6, q6, q7          @shr
    540 @   vmovn.i16   d12, q6
    541     vrshrn.s16  d12,q6,#3                   @rounding shift by 3 and narrow: (sum + 4) >> 3, replaces the three lines above for nt = 4
    542     vst1.s32    {d12[0]}, [r2], r3          @store 4 pixels of this row, advance dst by dst_strd
    543 
    544     vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
    545     vsub.s8     d6, d6, d7                  @[nt-1-row]--
    546     subs        r1, r1, #1                  @rows left
    547 
    548     bne         loop_sz_4
    549 
    550 end_loop:
    551     vldmia      sp!, {d8 - d15}             @restore the callee-saved neon registers pushed on entry
    552     ldmia       sp!, {r4-r12, pc}           @restore core registers; the saved lr is loaded into pc to return
    553 
    554 
    555 
    556 
    557 
    558 
    559 
    560 
    561