Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 ///*******************************************************************************
     20 //* //file
     21 //*  ihevcd_fmt_conv_420sp_to_420sp.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for format conversions
     25 //*
     26 //* //author
     27 //*  ittiam
     28 //*
     29 //* //par list of functions:
//*  - ihevcd_fmt_conv_420sp_to_420sp_av8()
     31 //*
     32 //* //remarks
     33 //*  none
     34 //*
     35 //*******************************************************************************/
     36     .equ DO1STROUNDING, 0
     37 
     38     // ARM
     39     //
     40     // PRESERVE8
     41 
     42 .text
     43 .p2align 2
     44 
     45 .include "ihevc_neon_macros.s"
     46 
     47 
     48 
     49 
     50 ///*****************************************************************************
     51 //*                                                                            *
     52 //*  Function Name    : ihevcd_fmt_conv_420sp_to_420sp()                       *
     53 //*                                                                            *
     54 //*  Description      : This function conversts the image from YUV420SP color  *
     55 //*                     space to 420SP color space(UV interleaved).                 *
     56 //*                                                                            *
     57 //*  Arguments        : x0           pu1_y                                     *
     58 //*                     x1           pu1_uv                                    *
     59 //*                     x2           pu1_dest_y                                *
     60 //*                     x3           pu1_dest_uv                               *
     61 //*                     [x13 #40]    u2_width                                  *
     62 //*                     [x13 #44]    u2_height                                 *
     63 //*                     [x13 #48]    u2_stridey                                *
     64 //*                     [x13 #52]    u2_stridechroma                           *
     65 //*                     [x13 #56]    u2_dest_stridey                           *
     66 //*                     [x13 #60]    u2_dest_stridechroma                      *
     67 //*                                                                            *
     68 //*  Values Returned  : None                                                   *
     69 //*                                                                            *
     70 //*  Register Usage   : x0 - x14                                               *
     71 //*                                                                            *
     72 //*  Stack Usage      : 40 Bytes                                               *
     73 //*                                                                            *
     74 //*  Interruptibility : Interruptible                                          *
     75 //*                                                                            *
     76 //*  Known Limitations                                                         *
     77 //*       Assumptions: Image Width:     Assumed to be multiple of 2 and       *
     78 //*                     Image Height:    Assumed to be even.                   *
     79 //*                                                                            *
     80 //*  Revision History :                                                        *
     81 //*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
     82 //*         16 05 2012   Naveen SR     draft                                     *
     83 //*                                                                            *
     84 //*****************************************************************************/
     85 
     86     .global ihevcd_fmt_conv_420sp_to_420sp_av8
     87 .type ihevcd_fmt_conv_420sp_to_420sp_a9q, %function
     88 ihevcd_fmt_conv_420sp_to_420sp_av8:
     89 
     90     // STMFD sp!,{x4-x12, x14}
     91     push_v_regs
     92     stp         x19, x20,[sp,#-16]!
     93 
     94     mov         x8, x4                      ////Load u2_width
     95     mov         x9, x5                      ////Load u2_height
     96 
     97     LDR         w5, [sp,#80]                ////Load u2_dest_stridey
     98     sxtw        x5,w5
     99 
    100     mov         x7, x6                      ////Load u2_stridey
    101 
    102     SUB         x10,x7,x8                   //// Src Y increment
    103     SUB         x11,x5,x8                   //// Dst Y increment
    104 
    105     ///* Copy Y */
    106 
    107     MOV         x4,x9                       //// Copying height
    108 y_row_loop:
    109     MOV         x6,x8                       //// Copying width
    110 
    111 y_col_loop:
    112     prfm        PLDL1KEEP,[x0, #128]
    113     SUB         x6,x6,#32
    114     LD1         {v0.8b},[x0],#8
    115     LD1         {v1.8b},[x0],#8
    116     LD1         {v2.8b},[x0],#8
    117     LD1         {v3.8b},[x0],#8
    118     ST1         {v0.8b},[x2],#8
    119     ST1         {v1.8b},[x2],#8
    120     ST1         {v2.8b},[x2],#8
    121     ST1         {v3.8b},[x2],#8
    122     CMP         x6,#32
    123     BGE         y_col_loop
    124     CMP         x6,#0
    125     BEQ         y_col_loop_end
    126     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    127     ////Ex if width is 162, above loop will process 160 pixels. And
    128     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    129     //// and written using VLD1 and VST1
    130     sub         x20,x6,#32
    131     neg         x6, x20
    132     SUB         x0,x0,x6
    133     SUB         x2,x2,x6
    134     LD1         {v0.8b},[x0],#8
    135     LD1         {v1.8b},[x0],#8
    136     LD1         {v2.8b},[x0],#8
    137     LD1         {v3.8b},[x0],#8
    138     ST1         {v0.8b},[x2],#8
    139     ST1         {v1.8b},[x2],#8
    140     ST1         {v2.8b},[x2],#8
    141     ST1         {v3.8b},[x2],#8
    142 
    143 y_col_loop_end:
    144     ADD         x0, x0, x10
    145     ADD         x2, x2, x11
    146     SUBS        x4, x4, #1
    147     BGT         y_row_loop
    148 
    149 
    150 
    151     ///* Copy UV */
    152 
    153     LDR         w5, [sp,#88]                ////Load u2_dest_stridechroma
    154     sxtw        x5,w5
    155 
    156     LSR         x9, x9, #1                  //// height/2
    157 //    MOV     x8,x8,LSR #1            @// Width/2
    158 
    159     MOV         x2,x3                       //pu1_dest_uv
    160 
    161     SUB         x10,x7,x8                   //// Src UV increment
    162     SUB         x11,x5,x8                   //// Dst UV increment
    163 
    164     MOV         x4,x9                       //// Copying height
    165 uv_row_loop:
    166     MOV         x6,x8                       //// Copying width
    167 
    168 uv_col_loop:
    169 
    170     prfm        PLDL1KEEP,[x1, #128]
    171     SUB         x6,x6,#16
    172     LD1         {v0.8b},[x1],#8
    173     LD1         {v1.8b},[x1],#8
    174     ST1         {v0.8b},[x2],#8
    175     ST1         {v1.8b},[x2],#8
    176     CMP         x6,#16
    177     BGE         uv_col_loop
    178     CMP         x6,#0
    179     BEQ         u_col_loop_end
    180     ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
    181     ////Ex if width is 162, above loop will process 160 pixels. And
    182     ////Both source and destination will point to 146th pixel and then 16 bytes will be read
    183     //// and written using VLD1 and VST1
    184     sub         x20,x6,#16
    185     neg         x6, x20
    186     SUB         x1,x1,x6
    187     SUB         x2,x2,x6
    188     LD1         {v0.8b},[x1],#8
    189     LD1         {v1.8b},[x1],#8
    190     ST1         {v0.8b},[x2],#8
    191     ST1         {v1.8b},[x2],#8
    192 
    193 u_col_loop_end:
    194     ADD         x1, x1, x10
    195     ADD         x2, x2, x11
    196     SUBS        x4, x4, #1
    197     BGT         uv_row_loop
    198 
    199 exit:
    200     // LDMFD sp!,{x4-x12, pc}
    201     ldp         x19, x20,[sp],#16
    202     pop_v_regs
    203     ret
    204 
    205 
    206     .section .note.GNU-stack,"",%progbits
    207 
    208