Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @ *******************************************************************************
     20 @ * @file
     21 @ *  ihevc_padding_neon.s
     22 @ *
     23 @ * @brief
     24 @ *  contains function definitions padding
     25 @ *
     26 @ * @author
     27 @ *  naveen sr
     28 @ *
     29 @ * @par list of functions:
@ *  - ihevc_pad_left_luma()
@ *  - ihevc_pad_left_chroma()
@ *  - ihevc_pad_right_luma()
@ *  - ihevc_pad_right_chroma()
     32 @ *
     33 @ * @remarks
     34 @ *  none
     35 @ *
     36 @ *******************************************************************************
     37 @*/
     38 
     39 @/**
     40 @*******************************************************************************
     41 @*
     42 @* @brief
     43 @*   padding (luma block) at the left of a 2d array
     44 @*
     45 @* @par description:
     46 @*   the left column of a 2d array is replicated for pad_size times at the left
     47 @*
     48 @*
     49 @* @param[in] pu1_src
     50 @*  uword8 pointer to the source
     51 @*
     52 @* @param[in] src_strd
     53 @*  integer source stride
     54 @*
     55 @* @param[in] ht
     56 @*  integer height of the array
     57 @*
     58 @* @param[in] wd
     59 @*  integer width of the array
     60 @*
     61 @* @param[in] pad_size
     62 @*  integer -padding size of the array
     63 @*
     64 @* @param[in] ht
     65 @*  integer height of the array
     66 @*
     67 @* @param[in] wd
     68 @*  integer width of the array
     69 @*
     70 @* @returns
     71 @*
     72 @* @remarks
     73 @*  none
     74 @*
     75 @*******************************************************************************
     76 @*/
     77 @.if pad_left_luma == c
     78 @void ihevc_pad_left_luma(uword8 *pu1_src,
     79 @                        word32 src_strd,
     80 @                        word32 ht,
     81 @                        word32 pad_size)
     82 @**************variables vs registers*************************
     83 @   r0 => *pu1_src
     84 @   r1 => src_strd
     85 @   r2 => ht
     86 @   r3 => pad_size
     87 
     88 .text
     89 .align 4
     90 
     91 
     92 
     93 
     94 .globl ihevc_pad_left_luma_a9q
     95 
     96 .type ihevc_pad_left_luma_a9q, %function
     97 
     98 ihevc_pad_left_luma_a9q:
     99 
    100     stmfd       sp!, {r4-r11,lr}            @stack stores the values of the arguments
    101 
    102 loop_start_luma_left:
    103     @ pad size is assumed to be pad_left = 80
    104     sub         r4,r0,r3
    105 
    106     ldrb        r8,[r0]
    107     add         r0,r1
    108     ldrb        r9,[r0]
    109     add         r0,r1
    110     ldrb        r10,[r0]
    111     add         r0,r1
    112     ldrb        r11,[r0]
    113     add         r0,r1
    114 
    115     vdup.u8     q0,r8
    116     vdup.u8     q1,r9
    117     vdup.u8     q2,r10
    118     vdup.u8     q3,r11
    119 
    120     add         r5,r4,r1
    121 
    122     vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
    123     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    124     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    125     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    126     vst1.8      {d0,d1},[r4]                @ 16 bytes store
    127 
    128     add         r6,r5,r1
    129 
    130     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    131     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    132     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    133     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    134     vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
    135 
    136     add         r7,r6,r1
    137 
    138     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    139     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    140     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    141     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    142     vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
    143 
    144     subs        r2,#4
    145 
    146     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    147     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    148     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    149     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    150     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    151 
    152     @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    153 
    154     bne         loop_start_luma_left
    155 
    156     ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
    157 
    158 
    159 
    160 
    161 
    162 @/**
    163 @*******************************************************************************
    164 @*
    165 @* @brief
    166 @*   padding (chroma block) at the left of a 2d array
    167 @*
    168 @* @par description:
    169 @*   the left column of a 2d array is replicated for pad_size times at the left
    170 @*
    171 @*
    172 @* @param[in] pu1_src
    173 @*  uword8 pointer to the source
    174 @*
    175 @* @param[in] src_strd
    176 @*  integer source stride
    177 @*
    178 @* @param[in] ht
    179 @*  integer height of the array
    180 @*
    181 @* @param[in] wd
    182 @*  integer width of the array (each colour component)
    183 @*
    184 @* @param[in] pad_size
    185 @*  integer -padding size of the array
    186 @*
    187 @* @param[in] ht
    188 @*  integer height of the array
    189 @*
    190 @* @param[in] wd
    191 @*  integer width of the array
    192 @*
    193 @* @returns
    194 @*
    195 @* @remarks
    196 @*  none
    197 @*
    198 @*******************************************************************************
    199 @*/
    200 @.if pad_left_chroma == c
    201 @void ihevc_pad_left_chroma(uword8 *pu1_src,
    202 @                            word32 src_strd,
    203 @                            word32 ht,
    204 @                            word32 pad_size)
    205 @{
    206 @   r0 => *pu1_src
    207 @   r1 => src_strd
    208 @   r2 => ht
    209 @   r3 => pad_size
    210 
    211 
    212 
    213 .globl ihevc_pad_left_chroma_a9q
    214 
    215 .type ihevc_pad_left_chroma_a9q, %function
    216 
    217 ihevc_pad_left_chroma_a9q:
    218 
    219     stmfd       sp!, {r4-r11, lr}           @stack stores the values of the arguments
    220 
    221 loop_start_chroma_left:
    222     @ pad size is assumed to be pad_left = 80
    223     sub         r4,r0,r3
    224 
    225     ldrh        r8,[r0]
    226     add         r0,r1
    227     ldrh        r9,[r0]
    228     add         r0,r1
    229     ldrh        r10,[r0]
    230     add         r0,r1
    231     ldrh        r11,[r0]
    232     add         r0,r1
    233 
    234     vdup.u16    q0,r8
    235     vdup.u16    q1,r9
    236     vdup.u16    q2,r10
    237     vdup.u16    q3,r11
    238 
    239     add         r5,r4,r1
    240 
    241     vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
    242     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    243     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    244     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    245     vst1.8      {d0,d1},[r4]                @ 16 bytes store
    246 
    247     add         r6,r5,r1
    248 
    249     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    250     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    251     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    252     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    253     vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
    254 
    255     add         r7,r6,r1
    256 
    257     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    258     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    259     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    260     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    261     vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
    262 
    263     subs        r2,#4
    264 
    265     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    266     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    267     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    268     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    269     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    270 
    271     @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    272 
    273     bne         loop_start_chroma_left
    274 
    275     ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
    276 
    277 
    278 
    279 
    280 
    281 @/**
    282 @*******************************************************************************
    283 @*
    284 @* @brief
    285 @* padding (luma block) at the right of a 2d array
    286 @*
    287 @* @par description:
    288 @* the right column of a 2d array is replicated for pad_size times at the right
    289 @*
    290 @*
    291 @* @param[in] pu1_src
    292 @*  uword8 pointer to the source
    293 @*
    294 @* @param[in] src_strd
    295 @*  integer source stride
    296 @*
    297 @* @param[in] ht
    298 @*  integer height of the array
    299 @*
    300 @* @param[in] wd
    301 @*  integer width of the array
    302 @*
    303 @* @param[in] pad_size
    304 @*  integer -padding size of the array
    305 @*
    306 @* @param[in] ht
    307 @*  integer height of the array
    308 @*
    309 @* @param[in] wd
    310 @*  integer width of the array
    311 @*
    312 @* @returns
    313 @*
    314 @* @remarks
    315 @*  none
    316 @*
    317 @*******************************************************************************
    318 @*/
    319 @.if pad_right_luma == c
    320 @void ihevc_pad_right_luma(uword8 *pu1_src,
    321 @                        word32 src_strd,
    322 @                        word32 ht,
    323 @                        word32 pad_size)
    324 @{
    325 @    word32 row@
    326 @
    327 @    for(row = 0@ row < ht@ row++)
    328 @    {
    329 @        memset(pu1_src, *(pu1_src -1), pad_size)@
    330 @
    331 @        pu1_src += src_strd@
    332 @    }
    333 @}
    334 @
    335 @   r0 => *pu1_src
    336 @   r1 => src_strd
    337 @   r2 => ht
    338 @   r3 => pad_size
    339 
    340 
    341 
    342 .globl ihevc_pad_right_luma_a9q
    343 
    344 .type ihevc_pad_right_luma_a9q, %function
    345 
    346 ihevc_pad_right_luma_a9q:
    347 
    348     stmfd       sp!, {r4-r11, lr}           @stack stores the values of the arguments
    349 
    350 loop_start_luma_right:
    351     @ pad size is assumed to be pad_left = 80
    352     mov         r4,r0
    353 
    354     ldrb        r8,[r0, #-1]
    355     add         r0,r1
    356     ldrb        r9,[r0, #-1]
    357     add         r0,r1
    358     ldrb        r10,[r0, #-1]
    359     add         r0,r1
    360     ldrb        r11,[r0, #-1]
    361     add         r0,r1
    362 
    363     add         r5,r4,r1
    364     add         r6,r5,r1
    365     add         r7,r6,r1
    366 
    367     vdup.u8     q0,r8
    368     vdup.u8     q1,r9
    369     vdup.u8     q2,r10
    370     vdup.u8     q3,r11
    371 
    372     vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
    373     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    374     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    375     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    376     vst1.8      {d0,d1},[r4]                @ 16 bytes store
    377 
    378 
    379     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    380     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    381     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    382     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    383     vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
    384 
    385     subs        r2,#4
    386 
    387     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    388     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    389     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    390     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    391     vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
    392 
    393     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    394     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    395     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    396     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    397     vst1.8      {d6,d7},[r7]                @128/8 = 16 bytes store
    398 
    399 
    400     @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    401 
    402 
    403     bne         loop_start_luma_right
    404 
    405     ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
    406 
    407 
    408 
    409 
    410 
    411 @/**
    412 @*******************************************************************************
    413 @*
    414 @* @brief
    415 @@* padding (chroma block) at the right of a 2d array
    416 @*
    417 @* @par description:
    418 @* the right column of a 2d array is replicated for pad_size times at the right
    419 @*
    420 @*
    421 @* @param[in] pu1_src
    422 @@*  uword8 pointer to the source
    423 @*
    424 @* @param[in] src_strd
    425 @*  integer source stride
    426 @*
    427 @* @param[in] ht
    428 @@*  integer height of the array
    429 @*
    430 @* @param[in] wd
    431 @*  integer width of the array (each colour component)
    432 @*
    433 @* @param[in] pad_size
    434 @*  integer -padding size of the array
    435 @*
    436 @* @param[in] ht
    437 @@*  integer height of the array
    438 @*
    439 @* @param[in] wd
    440 @*  integer width of the array
    441 @*
    442 @* @returns
    443 @*
    444 @* @remarks
    445 @*  none
    446 @*
    447 @*******************************************************************************
    448 @*/
    449 @.if pad_right_chroma == c
    450 @void ihevc_pad_right_chroma(uword8 *pu1_src,
    451 @                        word32 src_strd,
    452 @                        word32 ht,
    453 @                        word32 pad_size)
    454 @   r0 => *pu1_src
    455 @   r1 => src_strd
    456 @   r2 => ht
    457 @   r3 => pad_size
    458 
    459 
    460 
    461 .globl ihevc_pad_right_chroma_a9q
    462 
    463 .type ihevc_pad_right_chroma_a9q, %function
    464 
    465 ihevc_pad_right_chroma_a9q:
    466 
    467     stmfd       sp!, {r4-r11, lr}           @stack stores the values of the arguments
    468 
    469 loop_start_chroma_right:
    470     @ pad size is assumed to be pad_left = 80
    471     mov         r4,r0
    472 
    473     ldrh        r8,[r0, #-2]
    474     add         r0,r1
    475     ldrh        r9,[r0, #-2]
    476     add         r0,r1
    477     ldrh        r10,[r0, #-2]
    478     add         r0,r1
    479     ldrh        r11,[r0, #-2]
    480     add         r0,r1
    481 
    482     vdup.u16    q0,r8
    483     vdup.u16    q1,r9
    484     vdup.u16    q2,r10
    485     vdup.u16    q3,r11
    486 
    487     add         r5,r4,r1
    488 
    489     vst1.8      {d0,d1},[r4]!               @128/8 = 16 bytes store
    490     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    491     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    492     vst1.8      {d0,d1},[r4]!               @ 16 bytes store
    493     vst1.8      {d0,d1},[r4]                @ 16 bytes store
    494 
    495     add         r6,r5,r1
    496 
    497     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    498     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    499     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    500     vst1.8      {d2,d3},[r5]!               @128/8 = 16 bytes store
    501     vst1.8      {d2,d3},[r5]                @128/8 = 16 bytes store
    502 
    503     add         r7,r6,r1
    504 
    505     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    506     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    507     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    508     vst1.8      {d4,d5},[r6]!               @128/8 = 16 bytes store
    509     vst1.8      {d4,d5},[r6]                @128/8 = 16 bytes store
    510 
    511     subs        r2,#4
    512 
    513     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    514     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    515     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    516     vst1.8      {d6,d7},[r7]!               @128/8 = 16 bytes store
    517     vst1.8      {d6,d7},[r7]                @128/8 = 16 bytes store
    518 
    519     @ total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    520 
    521     bne         loop_start_chroma_right
    522 
    523     ldmfd       sp!,{r4-r11,pc}             @reload the registers from sp
    524 
    525 
    526 
    527 
    528 
    529 
    530 
    531 
    532