Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 // *******************************************************************************
     20 // * //file
     21 // *  ihevc_padding_neon.s
     22 // *
     23 // * //brief
     24 // *  contains function definitions padding
     25 // *
     26 // * //author
     27 // *     naveen sr
     28 // *
     29 // * //par list of functions:
// *  - ihevc_pad_left_luma_av8()
// *  - ihevc_pad_left_chroma_av8()
// *  - ihevc_pad_right_luma_av8()
// *  - ihevc_pad_right_chroma_av8()
     32 // *
     33 // * //remarks
     34 // *  none
     35 // *
     36 // *******************************************************************************
     37 //*/
     38 
     39 ///**
     40 //*******************************************************************************
     41 //*
     42 //* //brief
     43 //*   padding (luma block) at the left of a 2d array
     44 //*
     45 //* //par description:
     46 //*   the left column of a 2d array is replicated for pad_size times at the left
     47 //*
     48 //*
     49 //* //param[in] pu1_src
     50 //*  uword8 pointer to the source
     51 //*
     52 //* //param[in] src_strd
     53 //*  integer source stride
     54 //*
//* //param[in] ht
//*  integer height of the array; rows are processed in groups of four, so
//*  ht is expected to be a positive multiple of 4
//*
//* //param[in] pad_size
//*  integer padding size: the padding starts pad_size bytes to the left of
//*  pu1_src. the routine always stores 80 bytes per row, so pad_size is
//*  expected to be 80
//*
//* note: the routine takes no wd (width) argument
     69 //*
     70 //* //returns
     71 //*
     72 //* //remarks
     73 //*  none
     74 //*
     75 //*******************************************************************************
     76 //*/
     77 //.if pad_left_luma == c
     78 //void ihevc_pad_left_luma(uword8 *pu1_src,
     79 //                        word32 src_strd,
     80 //                        word32 ht,
     81 //                        word32 pad_size)
     82 //**************variables vs registers*************************
     83 //    x0 => *pu1_src
     84 //    x1 => src_strd
     85 //    x2 => ht
     86 //    x3 => pad_size
     87 
     88 .text
     89 .align 4
     90 
     91 .globl ihevc_pad_left_luma_av8
     92 
     93 .type ihevc_pad_left_luma_av8, %function
     94 
     95 ihevc_pad_left_luma_av8:
     96 
     97 loop_start_luma_left:
     98     // pad size is assumed to be pad_left = 80
     99     sub         x4,x0,x3
    100 
    101     ldrb        w8,[x0]
    102     add         x0,x0,x1
    103     ldrb        w9,[x0]
    104     add         x0,x0,x1
    105     ldrb        w10,[x0]
    106     add         x0,x0,x1
    107     ldrb        w11,[x0]
    108     add         x0,x0,x1
    109 
    110     dup         v0.16b,w8
    111     dup         v2.16b,w9
    112     dup         v4.16b,w10
    113     dup         v6.16b,w11
    114 
    115     add         x5,x4,x1
    116 
    117     st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
    118     st1         {v0.16b},[x4],#16           // 16 bytes store
    119     st1         {v0.16b},[x4],#16           // 16 bytes store
    120     st1         {v0.16b},[x4],#16           // 16 bytes store
    121     st1         {v0.16b},[x4]               // 16 bytes store
    122 
    123     add         x6,x5,x1
    124 
    125     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    126     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    127     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    128     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    129     st1         {v2.16b},[x5]               //128/8 = 16 bytes store
    130 
    131     add         x7,x6,x1
    132 
    133     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    134     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    135     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    136     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    137     st1         {v4.16b},[x6]               //128/8 = 16 bytes store
    138 
    139     subs        x2, x2,#4
    140 
    141     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    142     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    143     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    144     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    145     st1         {v6.16b},[x7]               //128/8 = 16 bytes store
    146 
    147     // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    148 
    149     bne         loop_start_luma_left
    150 
    151     ret
    152 
    153 
    154 
    155 
    156 
    157 ///**
    158 //*******************************************************************************
    159 //*
    160 //* //brief
    161 //*   padding (chroma block) at the left of a 2d array
    162 //*
    163 //* //par description:
    164 //*   the left column of a 2d array is replicated for pad_size times at the left
    165 //*
    166 //*
    167 //* //param[in] pu1_src
    168 //*  uword8 pointer to the source
    169 //*
    170 //* //param[in] src_strd
    171 //*  integer source stride
    172 //*
//* //param[in] ht
//*  integer height of the array; rows are processed in groups of four, so
//*  ht is expected to be a positive multiple of 4
//*
//* //param[in] pad_size
//*  integer padding size in bytes: the padding starts pad_size bytes to the
//*  left of pu1_src. the routine always stores 80 bytes per row, so pad_size
//*  is expected to be 80
//*
//* note: the routine takes no wd (width) argument
    187 //*
    188 //* //returns
    189 //*
    190 //* //remarks
    191 //*  none
    192 //*
    193 //*******************************************************************************
    194 //*/
    195 //.if pad_left_chroma == c
    196 //void ihevc_pad_left_chroma(uword8 *pu1_src,
    197 //                            word32 src_strd,
    198 //                            word32 ht,
    199 //                            word32 pad_size)
    200 //{
    201 //    x0 => *pu1_src
    202 //    x1 => src_strd
    203 //    x2 => ht
    204 //    x3 => pad_size
    205 
    206 
    207 
    208 .globl ihevc_pad_left_chroma_av8
    209 
    210 .type ihevc_pad_left_chroma_av8, %function
    211 
    212 ihevc_pad_left_chroma_av8:
    213 
    214 
    215 loop_start_chroma_left:
    216     // pad size is assumed to be pad_left = 80
    217     sub         x4,x0,x3
    218 
    219     ldrh        w8,[x0]
    220     add         x0,x0,x1
    221     ldrh        w9,[x0]
    222     add         x0,x0,x1
    223     ldrh        w10,[x0]
    224     add         x0,x0,x1
    225     ldrh        w11,[x0]
    226     add         x0,x0,x1
    227 
    228     dup         v0.8h,w8
    229     dup         v2.8h,w9
    230     dup         v4.8h,w10
    231     dup         v6.8h,w11
    232 
    233     add         x5,x4,x1
    234 
    235     st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
    236     st1         {v0.16b},[x4],#16           // 16 bytes store
    237     st1         {v0.16b},[x4],#16           // 16 bytes store
    238     st1         {v0.16b},[x4],#16           // 16 bytes store
    239     st1         {v0.16b},[x4]               // 16 bytes store
    240 
    241     add         x6,x5,x1
    242 
    243     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    244     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    245     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    246     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    247     st1         {v2.16b},[x5]               //128/8 = 16 bytes store
    248 
    249     add         x7,x6,x1
    250 
    251     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    252     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    253     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    254     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    255     st1         {v4.16b},[x6]               //128/8 = 16 bytes store
    256 
    257     subs        x2, x2,#4
    258 
    259     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    260     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    261     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    262     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    263     st1         {v6.16b},[x7]               //128/8 = 16 bytes store
    264 
    265     // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    266 
    267     bne         loop_start_chroma_left
    268 
    269     ret
    270 
    271 
    272 
    273 
    274 
    275 ///**
    276 //*******************************************************************************
    277 //*
    278 //* //brief
    279 //* padding (luma block) at the right of a 2d array
    280 //*
    281 //* //par description:
    282 //* the right column of a 2d array is replicated for pad_size times at the right
    283 //*
    284 //*
    285 //* //param[in] pu1_src
    286 //*  uword8 pointer to the source
    287 //*
    288 //* //param[in] src_strd
    289 //*  integer source stride
    290 //*
//* //param[in] ht
//*  integer height of the array; rows are processed in groups of four, so
//*  ht is expected to be a positive multiple of 4
//*
//* //param[in] pad_size
//*  integer padding size of the array. the assembly routine does not read
//*  this argument: it always stores 80 bytes per row, so pad_size is
//*  expected to be 80
//*
//* note: the routine takes no wd (width) argument
    305 //*
    306 //* //returns
    307 //*
    308 //* //remarks
    309 //*  none
    310 //*
    311 //*******************************************************************************
    312 //*/
    313 //.if pad_right_luma == c
    314 //void ihevc_pad_right_luma(uword8 *pu1_src,
    315 //                        word32 src_strd,
    316 //                        word32 ht,
    317 //                        word32 pad_size)
    318 //{
    319 //    word32 row//
    320 //
    321 //    for(row = 0// row < ht// row++)
    322 //    {
    323 //        memset(pu1_src, *(pu1_src -1), pad_size)//
    324 //
    325 //        pu1_src += src_strd//
    326 //    }
    327 //}
    328 //
    329 //    x0 => *pu1_src
    330 //    x1 => src_strd
    331 //    x2 => ht
    332 //    x3 => pad_size
    333 
    334 
    335 
    336 .globl ihevc_pad_right_luma_av8
    337 
    338 .type ihevc_pad_right_luma_av8, %function
    339 
    340 ihevc_pad_right_luma_av8:
    341 
    342 
    343 loop_start_luma_right:
    344     // pad size is assumed to be pad_left = 80
    345     mov         x4,x0
    346 
    347     ldrb        w8,[x0, #-1]
    348     add         x0,x0,x1
    349     ldrb        w9,[x0, #-1]
    350     add         x0,x0,x1
    351     ldrb        w10,[x0, #-1]
    352     add         x0,x0,x1
    353     ldrb        w11,[x0, #-1]
    354     add         x0,x0,x1
    355 
    356     add         x5,x4,x1
    357     add         x6,x5,x1
    358     add         x7,x6,x1
    359 
    360     dup         v0.16b,w8
    361     dup         v2.16b,w9
    362     dup         v4.16b,w10
    363     dup         v6.16b,w11
    364 
    365     st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
    366     st1         {v0.16b},[x4],#16           // 16 bytes store
    367     st1         {v0.16b},[x4],#16           // 16 bytes store
    368     st1         {v0.16b},[x4],#16           // 16 bytes store
    369     st1         {v0.16b},[x4]               // 16 bytes store
    370 
    371 
    372     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    373     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    374     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    375     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    376     st1         {v2.16b},[x5]               //128/8 = 16 bytes store
    377 
    378     subs        x2, x2,#4
    379 
    380     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    381     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    382     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    383     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    384     st1         {v4.16b},[x6]               //128/8 = 16 bytes store
    385 
    386     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    387     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    388     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    389     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    390     st1         {v6.16b},[x7]               //128/8 = 16 bytes store
    391 
    392 
    393     // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    394 
    395 
    396     bne         loop_start_luma_right
    397 
    398     ret
    399 
    400 
    401 
    402 
    403 
    404 ///**
    405 //*******************************************************************************
    406 //*
    407 //* //brief
//* padding (chroma block) at the right of a 2d array
    409 //*
    410 //* //par description:
    411 //* the right column of a 2d array is replicated for pad_size times at the right
    412 //*
    413 //*
    414 //* //param[in] pu1_src
//*  uword8 pointer to the source
    416 //*
    417 //* //param[in] src_strd
    418 //*  integer source stride
    419 //*
//* //param[in] ht
//*  integer height of the array; rows are processed in groups of four, so
//*  ht is expected to be a positive multiple of 4
//*
//* //param[in] pad_size
//*  integer padding size of the array in bytes. the assembly routine does not
//*  read this argument: it always stores 80 bytes per row, so pad_size is
//*  expected to be 80
//*
//* note: the routine takes no wd (width) argument
    434 //*
    435 //* //returns
    436 //*
    437 //* //remarks
    438 //*  none
    439 //*
    440 //*******************************************************************************
    441 //*/
    442 //.if pad_right_chroma == c
    443 //void ihevc_pad_right_chroma(uword8 *pu1_src,
    444 //                        word32 src_strd,
    445 //                        word32 ht,
    446 //                        word32 pad_size)
    447 //    x0 => *pu1_src
    448 //    x1 => src_strd
    449 //    x2 => ht
    450 //    x3 => pad_size
    451 
    452 
    453 
    454 .globl ihevc_pad_right_chroma_av8
    455 
    456 .type ihevc_pad_right_chroma_av8, %function
    457 
    458 ihevc_pad_right_chroma_av8:
    459 
    460 
    461 loop_start_chroma_right:
    462     // pad size is assumed to be pad_left = 80
    463     mov         x4,x0
    464 
    465     ldrh        w8,[x0, #-2]
    466     add         x0,x0,x1
    467     ldrh        w9,[x0, #-2]
    468     add         x0,x0,x1
    469     ldrh        w10,[x0, #-2]
    470     add         x0,x0,x1
    471     ldrh        w11,[x0, #-2]
    472     add         x0,x0,x1
    473 
    474     dup         v0.8h,w8
    475     dup         v2.8h,w9
    476     dup         v4.8h,w10
    477     dup         v6.8h,w11
    478 
    479     add         x5,x4,x1
    480 
    481     st1         {v0.16b},[x4],#16           //128/8 = 16 bytes store
    482     st1         {v0.16b},[x4],#16           // 16 bytes store
    483     st1         {v0.16b},[x4],#16           // 16 bytes store
    484     st1         {v0.16b},[x4],#16           // 16 bytes store
    485     st1         {v0.16b},[x4]               // 16 bytes store
    486 
    487     add         x6,x5,x1
    488 
    489     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    490     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    491     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    492     st1         {v2.16b},[x5],#16           //128/8 = 16 bytes store
    493     st1         {v2.16b},[x5]               //128/8 = 16 bytes store
    494 
    495     add         x7,x6,x1
    496 
    497     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    498     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    499     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    500     st1         {v4.16b},[x6],#16           //128/8 = 16 bytes store
    501     st1         {v4.16b},[x6]               //128/8 = 16 bytes store
    502 
    503     subs        x2, x2,#4
    504 
    505     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    506     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    507     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    508     st1         {v6.16b},[x7],#16           //128/8 = 16 bytes store
    509     st1         {v6.16b},[x7]               //128/8 = 16 bytes store
    510 
    511     // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store
    512 
    513     bne         loop_start_chroma_right
    514 
    515     ret
    516 
    517 
    518 
    519 
    520 
    521 
    522 
    523 
    524