Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 ///*
     22 ////----------------------------------------------------------------------------
     23 //// File Name            : impeg2_format_conv.s
     24 ////
//// Description          : This file has the YUV420P to YUV420SP format
////                        conversion implementations for the MPEG2 decoder
////                        on neon platform.
     27 ////
     28 //// Reference Document   :
     29 ////
     30 //// Revision History     :
     31 ////      Date            Author                  Detail Description
     32 ////   ------------    ----------------    ----------------------------------
     33 ////   Jul 07, 2008     Naveen Kumar T                Created
     34 ////
     35 ////-------------------------------------------------------------------------
     36 //*/
     37 
     38 ///*
     39 //// ----------------------------------------------------------------------------
     40 //// Include Files
     41 //// ----------------------------------------------------------------------------
     42 //*/
     43 .set log2_16                    ,      4
     44 .set log2_2                     ,      1
     45 
     46 .text
     47 .include "impeg2_neon_macros.s"
     48 ///*
     49 //// ----------------------------------------------------------------------------
     50 //// Struct/Union Types and Define
     51 //// ----------------------------------------------------------------------------
     52 //*/
     53 
     54 ///*
     55 //// ----------------------------------------------------------------------------
     56 //// Static Global Data section variables
     57 //// ----------------------------------------------------------------------------
     58 //*/
     59 ////--------------------------- NONE --------------------------------------------
     60 
     61 ///*
     62 //// ----------------------------------------------------------------------------
     63 //// Static Prototype Functions
     64 //// ----------------------------------------------------------------------------
     65 //*/
     66 //// -------------------------- NONE --------------------------------------------
     67 
     68 ///*
     69 //// ----------------------------------------------------------------------------
     70 //// Exported functions
     71 //// ----------------------------------------------------------------------------
     72 //*/
     73 
     74 
     75 ///*****************************************************************************
     76 //*                                                                            *
     77 //*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()                      *
     78 //*                                                                            *
     79 //*  Description      : This function conversts the image from YUV420P color   *
     80 //*                     space to 420SP color space(UV interleaved).           *
     81 //*                                                                            *
     82 //*  Arguments        : x0          pu1_y                                     *
     83 //*                     x1          pu1_u                                     *
     84 //*                     x2          pu1_v                                     *
     85 //*                     x3          pu1_dest_y                                *
     86 //*                     x4          pu1_dest_uv                               *
     87 //*                     x5          u2_height                                 *
     88 //*                     x6          u2_width                                  *
     89 //*                     x7          u2_stridey                                *
     90 //*                     sp, #80     u2_strideu                                *
     91 //*                     sp, #88     u2_stridev                                *
     92 //*                     sp, #96     u2_dest_stride_y                          *
     93 //*                     sp, #104    u2_dest_stride_uv                         *
     94 //*                     sp, #112    convert_uv_only                           *
     95 //*                                                                            *
     96 //*  Values Returned  : None                                                   *
     97 //*                                                                            *
     98 //*  Register Usage   : x8, x10, x16, x20, v0, v1                              *
     99 //*                                                                            *
    100 //*  Stack Usage      : 80 Bytes                                               *
    101 //*                                                                            *
    102 //*  Interruptibility : Interruptible                                          *
    103 //*                                                                            *
    104 //*  Known Limitations                                                         *
    105 //*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
    106 //*                     greater than or equal to 16                  *
    107 //*                     Image Height:    Assumed to be even.                   *
    108 //*                                                                            *
    109 //*  Revision History :                                                        *
    110 //*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
    111 //*         07 06 2010   Varshita        Draft                                 *
    112 //*         07 06 2010   Naveen Kr T     Completed                             *
    113 //*                                                                            *
    114 //*****************************************************************************/
    115 .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
    116 impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:
    117 
    118     //// push the registers on the stack
    119     //    pu1_y,                - x0
    120     //    pu1_u,                - x1
    121     //    pu1_v,                - x2
    122     //    pu1_dest_y,           - x3
    123     //    pu1_dest_uv,          - x4
    124     //    u2_height,            - x5
    125     //    u2_width,             - x6
    126     //    u2_stridey,           - x7
    127     //    u2_strideu,           - sp, #80
    128     //    u2_stridev,           - sp, #88
    129     //    u2_dest_stride_y,     - sp, #96
    130     //    u2_dest_stride_uv,    - sp, #104
    131     //    convert_uv_only       - sp, #112
    132     // STMFD sp!,{x4-x12,x14}
    133     push_v_regs
    134     stp             x19, x20, [sp, #-16]!
    135 
    136     ldr             w14, [sp, #112]     //// Load convert_uv_only
    137 
    138     cmp             w14, #1
    139     beq             yuv420sp_uv_chroma
    140     ///* Do the preprocessing before the main loops start */
    141     //// Load the parameters from stack
    142 
    143     ldr             w8, [sp, #96]       //// Load u2_dest_stride_y from stack
    144     uxtw            x8, w8
    145 
    146     sub             x7, x7, x6          //// Source increment
    147 
    148     sub             x8, x8, x6          //// Destination increment
    149 
    150 
    151 yuv420sp_uv_row_loop_y:
    152     mov             x16, x6
    153 
    154 yuv420sp_uv_col_loop_y:
    155     prfm            pldl1keep, [x0, #128]
    156     ld1             {v0.8b, v1.8b}, [x0], #16
    157     st1             {v0.8b, v1.8b}, [x3], #16
    158     sub             x16, x16, #16
    159     cmp             x16, #15
    160     bgt             yuv420sp_uv_col_loop_y
    161 
    162     cmp             x16, #0
    163     beq             yuv420sp_uv_row_loop__y
    164     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    165     ////Ex if width is 162, above loop will process 160 pixels. And
    166     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    167     //// and written using VLD1 and VST1
    168     sub             x20, x16, #16
    169     neg             x16, x20
    170     sub             x0, x0, x16
    171     sub             x3, x3, x16
    172 
    173     ld1             {v0.8b, v1.8b}, [x0], #16
    174     st1             {v0.8b, v1.8b}, [x3], #16
    175 
    176 yuv420sp_uv_row_loop__y:
    177     add             x0, x0, x7
    178     add             x3, x3, x8
    179     subs            x5, x5, #1
    180     bgt             yuv420sp_uv_row_loop_y
    181 
    182 yuv420sp_uv_chroma:
    183     ldr             w7, [sp, #88]       //// Load u2_strideu from stack
    184     sxtw            x7, w7
    185 
    186     ldr             w8, [sp, #104]      //// Load u2_dest_stride_uv from stack
    187     sxtw            x8, w8
    188 
    189     sub             x7, x7, x6, lsr #1  //// Source increment
    190 
    191     sub             x8, x8, x6          //// Destination increment
    192 
    193     lsr             x6, x6, #1
    194     lsr             x5, x5, #1
    195 yuv420sp_uv_row_loop_uv:
    196     mov             x16, x6
    197 
    198 
    199 yuv420sp_uv_col_loop_uv:
    200     prfm            pldl1keep, [x1, #128]
    201     prfm            pldl1keep, [x2, #128]
    202 
    203     ld1             {v0.8b}, [x1], #8
    204     ld1             {v1.8b}, [x2], #8
    205     st2             {v0.8b, v1.8b}, [x4], #16
    206 
    207     sub             x16, x16, #8
    208     cmp             x16, #7
    209     bgt             yuv420sp_uv_col_loop_uv
    210 
    211     cmp             x16, #0
    212     beq             yuv420sp_uv_row_loop__uv
    213     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    214     ////Ex if width is 162, above loop will process 160 pixels. And
    215     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    216     //// and written using VLD1 and VST1
    217     sub             x20, x16, #8
    218     neg             x16, x20
    219     sub             x1, x1, x16
    220     sub             x2, x2, x16
    221     sub             x4, x4, x16, lsl #1
    222 
    223     ld1             {v0.8b}, [x1], #8
    224     ld1             {v1.8b}, [x2], #8
    225     st2             {v0.8b, v1.8b}, [x4], #16
    226 
    227 yuv420sp_uv_row_loop__uv:
    228     add             x1, x1, x7
    229     add             x2, x2, x7
    230     add             x4, x4, x8
    231     subs            x5, x5, #1
    232     bgt             yuv420sp_uv_row_loop_uv
    233     ////POP THE REGISTERS
    234     // LDMFD sp!,{x4-x12,PC}
    235     ldp             x19, x20, [sp], #16
    236     pop_v_regs
    237     ret
    238 
    239 
    240 
    241 
    242 
    243 ///*****************************************************************************
    244 //*                                                                            *
    245 //*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()                      *
    246 //*                                                                            *
    247 //*  Description      : This function conversts the image from YUV420P color   *
    248 //*                     space to 420SP color space(VU interleaved).           *
    249 //*               This function is similar to above function          *
    250 //*               IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in   *
    251 //*               VLD1.8 for chroma - order of registers is different    *
    252 //*                                                                            *
    253 //*  Arguments        : x0          pu1_y                                     *
    254 //*                     x1          pu1_u                                     *
    255 //*                     x2          pu1_v                                     *
    256 //*                     x3          pu1_dest_y                                *
    257 //*                     x4          pu1_dest_uv                               *
    258 //*                     x5          u2_height                                 *
    259 //*                     x6          u2_width                                  *
    260 //*                     x7          u2_stridey                                *
    261 //*                     sp, #80     u2_strideu                                *
    262 //*                     sp, #88     u2_stridev                                *
    263 //*                     sp, #96     u2_dest_stride_y                          *
    264 //*                     sp, #104    u2_dest_stride_uv                         *
    265 //*                     sp, #112    convert_uv_only                           *
    266 //*                                                                            *
    267 //*  Values Returned  : None                                                   *
    268 //*                                                                            *
    269 //*  Register Usage   : x8, x14, x16, x20, v0, v1                              *
    270 //*                                                                            *
    271 //*  Stack Usage      : 80 Bytes                                               *
    272 //*                                                                            *
    273 //*  Interruptibility : Interruptible                                          *
    274 //*                                                                            *
    275 //*  Known Limitations                                                         *
    276 //*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
    277 //*                     greater than or equal to 16                  *
    278 //*                     Image Height:    Assumed to be even.                   *
    279 //*                                                                            *
    280 //*  Revision History :                                                        *
    281 //*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
    282 //*         07 06 2010   Varshita        Draft                                 *
    283 //*         07 06 2010   Naveen Kr T     Completed                             *
    284 //*                                                                            *
    285 //*****************************************************************************/
    286 
    287 .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
    288 impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:
    289 
    290     //// push the registers on the stack
    291     //    pu1_y,                - x0
    292     //    pu1_u,                - x1
    293     //    pu1_v,                - x2
    294     //    pu1_dest_y,           - x3
    295     //    pu1_dest_uv,          - x4
    296     //    u2_height,            - x5
    297     //    u2_width,             - x6
    298     //    u2_stridey,           - x7
    299     //    u2_strideu,           - sp, #80
    300     //    u2_stridev,           - sp, #88
    301     //    u2_dest_stride_y,     - sp, #96
    302     //    u2_dest_stride_uv,    - sp, #104
    303     //    convert_uv_only       - sp, #112
    304     // STMFD sp!,{x4-x12,x14}
    305     push_v_regs
    306     stp             x19, x20, [sp, #-16]!
    307 
    308     ldr             w14, [sp, #112]     //// Load convert_uv_only
    309 
    310     cmp             w14, #1
    311     beq             yuv420sp_vu_chroma
    312 
    313     ///* Do the preprocessing before the main loops start */
    314     //// Load the parameters from stack
    315 
    316     ldr             w8, [sp, #96]       //// Load u2_dest_stride_y from stack
    317     uxtw            x8, w8
    318 
    319     sub             x7, x7, x6          //// Source increment
    320 
    321     sub             x8, x8, x6          //// Destination increment
    322 
    323 
    324 yuv420sp_vu_row_loop_y:
    325     mov             x16, x6
    326 
    327 yuv420sp_vu_col_loop_y:
    328     prfm            pldl1keep, [x0, #128]
    329     ld1             {v0.8b, v1.8b}, [x0], #16
    330     st1             {v0.8b, v1.8b}, [x3], #16
    331     sub             x16, x16, #16
    332     cmp             x16, #15
    333     bgt             yuv420sp_vu_col_loop_y
    334 
    335     cmp             x16, #0
    336     beq             yuv420sp_vu_row_loop__y
    337     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    338     ////Ex if width is 162, above loop will process 160 pixels. And
    339     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    340     //// and written using VLD1 and VST1
    341     sub             x20, x16, #16
    342     neg             x16, x20
    343     sub             x0, x0, x16
    344     sub             x3, x3, x16
    345 
    346     ld1             {v0.8b, v1.8b}, [x0], #16
    347     st1             {v0.8b, v1.8b}, [x3], #16
    348 
    349 yuv420sp_vu_row_loop__y:
    350     add             x0, x0, x7
    351     add             x3, x3, x8
    352     subs            x5, x5, #1
    353     bgt             yuv420sp_vu_row_loop_y
    354 
    355 yuv420sp_vu_chroma:
    356     ldr             w7, [sp, #80]       //// Load u2_strideu from stack
    357     sxtw            x7, w7
    358 
    359     ldr             w8, [sp, #104]      //// Load u2_dest_stride_uv from stack
    360     sxtw            x8, w8
    361 
    362     sub             x7, x7, x6, lsr #1  //// Source increment
    363 
    364     sub             x8, x8, x6          //// Destination increment
    365 
    366     lsr             x6, x6, #1
    367     lsr             x5, x5, #1
    368 yuv420sp_vu_row_loop_uv:
    369     mov             x16, x6
    370 
    371 
    372 yuv420sp_vu_col_loop_uv:
    373     prfm            pldl1keep, [x1, #128]
    374     prfm            pldl1keep, [x2, #128]
    375     ld1             {v1.8b}, [x1], #8
    376     ld1             {v0.8b}, [x2], #8
    377     st2             {v0.8b, v1.8b}, [x4], #16
    378     sub             x16, x16, #8
    379     cmp             x16, #7
    380     bgt             yuv420sp_vu_col_loop_uv
    381 
    382     cmp             x16, #0
    383     beq             yuv420sp_vu_row_loop__uv
    384     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    385     ////Ex if width is 162, above loop will process 160 pixels. And
    386     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    387     //// and written using VLD1 and VST1
    388     sub             x20, x16, #8
    389     neg             x16, x20
    390     sub             x1, x1, x16
    391     sub             x2, x2, x16
    392     sub             x4, x4, x16, lsl #1
    393 
    394     ld1             {v1.8b}, [x1], #8
    395     ld1             {v0.8b}, [x2], #8
    396     st2             {v0.8b, v1.8b}, [x4], #16
    397 
    398 yuv420sp_vu_row_loop__uv:
    399     add             x1, x1, x7
    400     add             x2, x2, x7
    401     add             x4, x4, x8
    402     subs            x5, x5, #1
    403     bgt             yuv420sp_vu_row_loop_uv
    404     ////POP THE REGISTERS
    405     // LDMFD sp!,{x4-x12,PC}
    406     ldp             x19, x20, [sp], #16
    407     pop_v_regs
    408     ret
    409 
    410