@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_dc.s
@*
@* @brief
@*  contains function definitions for intra prediction dc filtering.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intra prediction filter for dc input
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
@                              word32 src_strd,
@                              uword8 *pu1_dst,
@                              word32 dst_strd,
@                              word32 nt,
@                              word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode
@   pi1_coeff

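@/**
@* reference sketch of the operation implemented below (illustration only,
@* derived from this code and the hevc dc prediction rules; not assembled):
@*
@*   sum  = nt                                      // rounding term
@*   sum += pu1_ref[nt] .. pu1_ref[2*nt - 1]        // left neighbours
@*   sum += pu1_ref[2*nt + 1] .. pu1_ref[3*nt]      // top neighbours
@*   dc   = sum >> (log2(nt) + 1)
@*
@*   if nt == 32: fill the whole block with dc (no boundary smoothing)
@*   else:
@*     dst[0]           = (pu1_ref[2*nt - 1] + 2*dc + pu1_ref[2*nt + 1] + 2) >> 2
@*     first row, x > 0 : (pu1_ref[2*nt + 1 + x] + 3*dc + 2) >> 2
@*     first col, y > 0 : (pu1_ref[2*nt - 1 - y] + 3*dc + 2) >> 2
@*     all other samples = dc
@*/
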
.text
.align 4




.globl ihevc_intra_pred_luma_dc_a9q

.type ihevc_intra_pred_luma_dc_a9q, %function

ihevc_intra_pred_luma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt

@********** testing
    @mov        r6, #128
    @b      prologue_cpy_32
@********** testing

    mov         r11, #2                     @mov #2 to r11 (to be used to add to 2dc_val & 3dc_val)
    mov         r9, #0
    vmov        d17, r11, r9

    clz         r5, r4

    add         r6, r0, r4                  @&src[nt]
    rsb         r5, r5, #32                 @log2nt + 1
    add         r7, r0, r4, lsl #1          @&src[2nt]

    add         r8, r7, #1                  @&src[2nt+1]
    mvn         r5, r5
    add         r5, r5, #1
    vdup.32     d8, r5

    ldrb        r14, [r8]
    vshl.i64    d8, d8, #32

    sub         r9, r7, #1                  @&src[2nt-1]
    vshr.s64    d8, d8, #32
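    @d8 now holds -(log2nt + 1); the later vshl.s64 with this negative count
    @performs the right shift that produces dc_val from the accumulated sum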

    mov         r7, r8                      @r7 also stores 2nt+1

    ldrb        r12, [r9]
    add         r14, r14, r12               @src[2nt+1] + src[2nt-1]
    add         r14, r14, r11               @src[2nt+1] + src[2nt-1] + 2

    cmp         r4, #4
    beq         dc_4

    mov         r10, r4                     @nt

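@accumulate nt left neighbours (from src[nt]) and nt top neighbours (from
@src[2nt+1]), eight bytes per side per iteration; d6 starts at nt so the
@rounding term is included from the outset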
add_loop:
    vld1.s8     d0, [r6]!                   @load from src[nt]
    mov         r5, #0                      @
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0

    vmov        d6, r4, r5                  @store nt to accumulate
    vpaddl.u8   d3, d1

    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 8)

    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 8)
    vadd.u16    d4, d2, d3


    vpaddl.u16  d5, d4


    vpadal.u32  d6, d5                      @accumulate all inp into d6 (end for nt==8)

    subs        r10, #8
    beq         epil_add_loop

core_loop_add:
    vpaddl.u8   d2, d0
    subs        r10, #8
    vpaddl.u8   d3, d1



    vadd.u16    d4, d2, d3
    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 16)

    vpaddl.u16  d5, d4
    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 16)

    vpadal.u32  d6, d5                      @accumulate all inp into d6
    bne         core_loop_add

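@sum is complete: derive dc_val and, for nt < 32, the constants needed for the
@boundary smoothing (dst[0] and 3*dc + 2); nt == 32 branches to the plain fill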
epil_add_loop:

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    cmp         r4, #32

    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
    moveq       r6, #128

    vdup.8      d16, d9[0]                  @dc_val
    vshl.s64    d13, d9, #1                 @2*dc

    beq         prologue_cpy_32

    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val
    movne       r6, #0                      @nt

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    movne       r10, r4

    vadd.i64    d11, d13, d9                @3*dc
    sub         r12, r3, r3, lsl #3         @-7*strd

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #8                @offset after one 8x8 block (-7*strd + 8)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt

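@8x8 prologue: the first row is built from the top neighbours and the first
@column from the left neighbours, each as (neighbour + 3*dc + 2) >> 2; lane 0
@of row 0 takes the specially filtered dst[0] from d15, and the filtered
@first-column values sit in d3 (after vrev64) to be inserted one per row into
@lane 0 of a dc-filled row (d16) via vbsl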
prologue_col:
    @0th column and 0-7 rows done here
    @r8 and r9 (2nt+1+col 2nt-1-row)

    mov         r8, r7                      @&src[2nt+1]

    add         r0, r0, #8                  @strd - nt + 8
    vld1.s8     d0, [r8]!                   @col 1::7 load (prol)
    sub         r9, r9, #7                  @&src[2nt-1-row]

    vld1.s8     d1, [r9]                    @row 7::1 (0 also) load (prol)
    sub         r9, r9, #8

    vmovl.u8    q10, d0

    vld1.s8     d6, [r8]                    @col 8::15 load (prol extra)
    vadd.i16    q10, q10, q12               @col 1::7 add 3dc+2 (prol)

    vmovl.u8    q11, d1
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmovl.u8    q13, d6
    vadd.i16    q11, q11, q12               @row 1::7 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vbsl        d19, d15, d2                @first row with dst[0]
    vadd.i16    q13, q13, q12               @col 8::15 add 3dc+2 (prol extra)

    vrev64.8    d3, d3

    vst1.8      d19, [r2], r3               @store row 0 (prol)
    vshr.s64    d3, d3, #8                  @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)

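@each pass below writes eight rows: lane 0 of every row is the next filtered
@left-neighbour byte shifted out of d3, all other lanes are dc_val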
loop_again_col_row:

    vbsl        d20, d3, d16                @row 1  (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.8      d20, [r2], r3               @store row 1 (prol)
    vqshrun.s16 d4, q13, #2                 @columns shr2 movn (prol extra)


    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
    vshr.s64    d3, d3, #8                  @row 2 shift (prol)

    vst1.8      d21, [r2], r3               @store row 2 (prol)


    vbsl        d20, d3, d16                @row 3  (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 4 (prol)
    vshr.s64    d3, d3, #8                  @row 3 shift (prol)

    vst1.8      d20, [r2], r3               @store row 3 (prol)


    vbsl        d21, d3, d16                @row 4 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 5 (prol)
    vshr.s64    d3, d3, #8                  @row 4 shift (prol)

    vst1.8      d21, [r2], r3               @store row 4 (prol)


    vbsl        d20, d3, d16                @row 5 (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 6 (prol)
    vshr.s64    d3, d3, #8                  @row 5 shift (prol)

    vst1.8      d20, [r2], r3               @store row 5 (prol)

    vld1.s8     d1, [r9]                    @row 8::15 load (prol extra)

    vbsl        d21, d3, d16                @row 6 (prol)

    vmovl.u8    q11, d1

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 7 (prol)
    vshr.s64    d3, d3, #8                  @row 6 shift (prol)

    vst1.8      d21, [r2], r3               @store row 6 (prol)

    vbsl        d20, d3, d16                @row 7 (prol)
    vadd.i16    q11, q11, q12               @row 8::15 add 3dc+2 (prol extra)

    vshr.s64    d3, d3, #8                  @row 7 shift (prol)
    vst1.8      d20, [r2], r12              @store row 7 (prol)

    subs        r10, r10, #8                @counter for cols

    beq         end_func
    blt         copy_16


    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vrev64.8    d3, d3

    vst1.8      d4, [r2], r3                @store 2nd col (for 16x16)

    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r0               @go to next row for 16


    vbsl        d20, d3, d16                @row 9  (prol)
    subs        r10, r10, #8

    vst1.8      d20, [r2], r3               @store row 9 (prol)
    vshr.s64    d3, d3, #8                  @row 9 shift (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)

    b           loop_again_col_row


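@fewer than 8 columns left: the remaining 8x8 block holds only dc_val
@(16x16 case: rows 8-15 of columns 8-15)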
copy_16:
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2]

    b           end_func

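@nt == 32: no boundary smoothing is applied; the whole block is filled with
@dc_val, two 16-byte stores per row and four rows per pass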
prologue_cpy_32:
    mov         r9, #128
    @sub        r7, r3, #-24
    add         r5, r2, r3
    add         r8, r5, r3
    add         r10, r8, r3
    vdup.8      q10, d16[0]
    lsl         r6, r3, #2
    add         r6, r6, #0xfffffff0

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    sub         r9, r9, #32                 @32x32 prol/epil counter dec

kernel_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    subs        r9, r9, #32

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    bne         kernel_copy

epilogue_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2]
    vst1.8      {d20,d21}, [r5]
    vst1.8      {d20,d21}, [r8]
    vst1.8      {d20,d21}, [r10]

    b           end_func


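@nt == 4: same boundary smoothing as above, on four neighbours per side; each
@row is written as a single 32-bit lane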
dc_4:
    vld1.s8     d0, [r6]!                   @load from src[nt]
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0
    mov         r5, #0                      @
    vmov        d6, r4, r5                  @store nt to accumulate
    vpaddl.u8   d3, d1

    vadd.u16    d4, d2, d3


    vpaddl.u16  d5, d4
    vmov.i64    d30, #0x00000000ffffffff

    vand        d5, d5, d30

    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
    vadd.i64    d6, d6, d5                  @accumulate all inp into d6 (end for nt==4)

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    mov         r8, r7                      @&src[2nt+1]

    vshl.s64    d13, d9, #1                 @2*dc
    sub         r9, r9, #3                  @&src[2nt-1-row]

    vdup.8      d16, d9[0]                  @dc_val
    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    sub         r12, r3, r3, lsl #2         @-3*strd
    vadd.i64    d11, d13, d9                @3*dc

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #4                @offset after one 4x4 block (-3*strd + 4)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt


    vld1.s8     d0, [r8]                    @col 1::3 load (prol)
    vld1.s8     d1, [r9]                    @row 3::1 (0 also) load (prol)

    vmovl.u8    q10, d0

    vmovl.u8    q11, d1
    vadd.i16    q10, q10, q12               @col 1::3 add 3dc+2 (prol)

    vadd.i16    q11, q11, q12               @row 1::3 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)


    vbsl        d19, d15, d2                @first row with dst[0]

    vrev64.8    d3, d3

    vst1.32     d19[0], [r2], r3            @store row 0 (prol)
    vshr.s64    d3, d3, #40                 @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)

    vbsl        d20, d3, d16                @row 1  (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.32     d20[0], [r2], r3            @store row 1 (prol)

    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)

    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
    vst1.32     d21[0], [r2], r3            @store row 2 (prol)

    vbsl        d20, d3, d16                @row 3  (prol)
    vst1.32     d20[0], [r2]                @store row 3 (prol)

epilogue_end:
end_func:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp