Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 ///*******************************************************************************
     20 //* //file
     21 //*  ihevcd_fmt_conv_420sp_to_420p.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for format conversions
     25 //*
     26 //* //author
     27 //*  ittiam
     28 //*
     29 //* //par list of functions:
     30 //*  ihevcd_fmt_conv_420sp_to_420p_av8()
     31 //*
     32 //* //remarks
     33 //*  none
     34 //*
     35 //*******************************************************************************/
     36 
     37 .text
     38 
     39 .include "ihevc_neon_macros.s"
     40 
     41 
     42 
     43 
     44 ///*****************************************************************************
     45 //*                                                                            *
     46 //*  Function Name    : neon_copy_yuv420sp_to_yuv420p()                       *
     47 //*                                                                            *
     48 //*  Description      : This function conversts the image from YUV420sP color  *
     49 //*                     space to 420SP color space(UV interleaved).                 *
     50 //*                                                                            *
     51 //*  Arguments        : x0           pu1_src_y                                 *
     52 //*                     x1           pu1_src_uv                                *
     53 //*                     x2           pu1_dest_y                                *
     54 //*                     x3           pu1_dest_u                               *
     55 //*                     [x13 #40]    pu1_dest_v                               *
     56 //*                     [x13 #44]    u2_width                                 *
     57 //*                     [x13 #48]    u2_height                                   *
     58 //*                     [x13 #52]    u2_stridey                                *
     59 //*                     [x13 #56]    u2_strideuv                               *
     60 //*                     [x13 #60]    u2_dest_stridey                           *
     61 //*                     [x13 #64]    u2_dest_strideuv                          *
     62 //*                     [x13 #68]    is_u_first                                *
     63 //*                     [x13 #72]    disable_luma_copy                         *
     64 //*                                                                            *
     65 //*  Values Returned  : None                                                   *
     66 //*                                                                            *
     67 //*  Register Usage   : x0 - x14                                               *
     68 //*                                                                            *
     69 //*  Stack Usage      : 40 Bytes                                               *
     70 //*                                                                            *
     71 //*  Interruptibility : Interruptible                                          *
     72 //*                                                                            *
     73 //*  Known Limitations                                                         *
     74 //*       Assumptions: Image Width:     Assumed to be multiple of 2 and       *
     75 //*                     Image Height:    Assumed to be even.                   *
     76 //*                                                                            *
     77 //*  Revision History :                                                        *
     78 //*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
     79 //*         16 05 2012   Naveen SR     draft                                     *
     80 //*                                                                            *
     81 //*****************************************************************************/
     82 
     83 .globl ihevcd_fmt_conv_420sp_to_420p_av8
     84 
     85 .type ihevcd_fmt_conv_420sp_to_420p_av8, %function
     86 
     87 ihevcd_fmt_conv_420sp_to_420p_av8:
     88     // STMFD sp!,{x4-x12, x14}
     89     push_v_regs
     90     stp         x19, x20,[sp,#-16]!
     91     mov         x15, x4
     92     mov         x8, x5                      ////Load u2_width
     93     mov         x9, x6                      ////Load u2_height
     94 
     95     LDR         w5, [sp,#88]                ////Load u2_dest_stridey
     96     sxtw        x5,w5
     97 //    LDR        x6,[sp,#80]                @//Load u2_strideuv
     98 
     99     SUB         x10,x7,x8                   //// Src Y increment
    100     SUB         x11,x5,x8                   //// Dst Y increment
    101 
    102     LDR         w5, [sp,#112]               ////Load disable_luma_copy flag
    103     sxtw        x5,w5
    104     CMP         x5,#0                       ////skip luma if disable_luma_copy is non-zero
    105     BNE         uv_copy_start
    106 
    107     ///* Copy Y */
    108 
    109     MOV         x4,x9                       //// Copying height
    110 y_row_loop:
    111     MOV         x6,x8                       //// Copying width
    112 
    113 y_col_loop:
    114 
    115     SUB         x6,x6,#16
    116     ld1         {v0.8b, v1.8b},[x0],#16
    117     st1         {v0.8b, v1.8b},[x2],#16
    118     CMP         x6,#16
    119     BGE         y_col_loop
    120     CMP         x6,#0
    121     BEQ         y_col_loop_end
    122     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    123     ////Ex if width is 162, above loop will process 160 pixels. And
    124     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    125     //// and written using VLD1 and VST1
    126     sub         x20,x6,#16
    127     neg         x6, x20
    128     SUB         x0,x0,x6
    129     SUB         x2,x2,x6
    130     ld1         {v0.8b, v1.8b}, [x0],#16
    131     st1         {v0.8b, v1.8b}, [x2],#16
    132 
    133 y_col_loop_end:
    134     ADD         x0, x0, x10
    135     ADD         x2, x2, x11
    136     SUBS        x4, x4, #1
    137     BGT         y_row_loop
    138 
    139 
    140     ///* Copy UV */
    141 uv_copy_start:
    142 
    143     LDR         w5, [sp,#96]                ////Load u2_dest_strideuv
    144     sxtw        x5,w5
    145     LDR         w7, [sp,#80]                ////Load u2_strideuv
    146     sxtw        x7,w7
    147 
    148     LSR         x9, x9, #1                  //// height/2
    149 //    MOV     x8,x8,LSR #1            @// Width/2
    150 
    151     SUB         x10,x7,x8                   //// Src UV increment
    152     LSR         x11, x8, #1
    153     SUB         x11,x5,x11                  //// Dst U and V increment
    154 
    155     mov         x5, x15                     ////Load pu1_dest_v
    156 
    157     LDR         w4, [sp,#104]               ////Load is_u_first_flag
    158     sxtw        x4,w4
    159     CMP         x4,#0                       ////Swap U and V dest if is_u_first_flag is zero
    160     csel        x4, x5, x4,EQ
    161     csel        x5, x3, x5,EQ
    162     csel        x3, x4, x3,EQ
    163 
    164     MOV         x4,x9                       //// Copying height
    165 uv_row_loop:
    166     MOV         x6,x8                       //// Copying width
    167 
    168 uv_col_loop:
    169 
    170     SUB         x6,x6,#16
    171 
    172     prfm        PLDL1KEEP,[x1,#128]
    173     ld2         {v0.8b, v1.8b},[x1],#16
    174     ST1         {v0.8b},[x3],#8
    175     ST1         {v1.8b},[x5],#8
    176     CMP         x6,#16
    177     BGE         uv_col_loop
    178     CMP         x6,#0
    179     BEQ         uv_col_loop_end
    180     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    181     ////Ex if width is 162, above loop will process 160 pixels. And
    182     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    183     //// and written using VLD1 and VST1
    184     sub         x20,x6,#16
    185     neg         x6, x20
    186     SUB         x1,x1,x6
    187     SUB         x3,x3,x6,LSR #1
    188     SUB         x5,x5,x6,LSR #1
    189     ld2         {v0.8b, v1.8b}, [x1],#16
    190     ST1         {v0.8b},[x3],#8
    191     ST1         {v1.8b},[x5],#8
    192 uv_col_loop_end:
    193     ADD         x1, x1, x10
    194     ADD         x3, x3, x11
    195     ADD         x5, x5, x11
    196     SUBS        x4, x4, #1
    197     BGT         uv_row_loop
    198 
    199 exit:
    200     // LDMFD sp!,{x4-x12, pc}
    201     ldp         x19, x20,[sp],#16
    202     pop_v_regs
    203     ret
    204 
    205 
    206 
    207 
    208 
    209 
    210