Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  ih264_mem_fns_neon.s
     24 // *
     25 // * @brief
     26 // *  Contains function definitions for memory manipulation
     27 // *
     28 // * @author
     29 // *     Naveen SR
     30 // *
     31 // * @par List of Functions:
     32 // *  - ih264_memcpy_av8()
     33 // *  - ih264_memcpy_mul_8_av8()
     34 // *  - ih264_memset_mul_8_av8()
     35 // *  - ih264_memset_16bit_mul_8_av8()
     36 // *  - ih264_memset_16bit_av8()
     37 // *
     38 // * @remarks
     39 // *  None
     40 // *
     41 // *******************************************************************************
     42 //*/
     43 
     44 .text
     45 .p2align 2
     46 .include "ih264_neon_macros.s"
     47 ///**
     48 //*******************************************************************************
     49 //*
     50 //* @brief
     51 //*   memcpy of a 1d array
     52 //*
     53 //* @par Description:
     54 //*   Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes
     55 //*
     56 //* @param[out] pu1_dst
     57 //*  UWORD8 pointer to the destination
     58 //*
     59 //* @param[in] pu1_src
     60 //*  UWORD8 pointer to the source
     61 //*
     62 //* @param[in] num_bytes
     63 //*  number of bytes to copy
     64 //* @returns
     65 //*
     66 //* @remarks
     67 //*  None
     68 //*
     69 //*******************************************************************************
     70 //*/
     71 //void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
     72 //                      UWORD8 *pu1_src,
     73 //                      UWORD32 num_bytes)
     74 //**************Variables Vs Registers*************************
     75 //    x0 => *pu1_dst
     76 //    x1 => *pu1_src
     77 //    w2 => num_bytes
     78 
     79 
     80 
     81 
     82 
     .global ih264_memcpy_mul_8_av8

//------------------------------------------------------------------------------
// void ih264_memcpy_mul_8_av8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
//
// Copies num_bytes bytes from pu1_src to pu1_dst in 8-byte groups, in
// ascending address order.
//
// In:       x0 = pu1_dst, x1 = pu1_src, w2 = num_bytes (expected multiple of 8)
// Clobbers: x0, x1, w2, v0, condition flags (all caller-saved under AAPCS64)
//
// num_bytes == 0 copies nothing. A count that is not a multiple of 8 is
// rounded up to the next multiple of 8 instead of wrapping the counter.
//------------------------------------------------------------------------------
ih264_memcpy_mul_8_av8:
    cbz       w2, .Lmemcpy_mul_8_done        // nothing to copy

.Lmemcpy_mul_8_loop:
    ld1       {v0.8b}, [x1], #8              // load 8 bytes, src += 8
    st1       {v0.8b}, [x0], #8              // store 8 bytes, dst += 8

    subs      w2, w2, #8                     // w2 = bytes still to copy
    b.hi      .Lmemcpy_mul_8_loop            // unsigned: continue while w2 > 0

.Lmemcpy_mul_8_done:
    ret
     95 
     96 
     97 
     98 //*******************************************************************************
     99 //*/
    100 //void ih264_memcpy(UWORD8 *pu1_dst,
    101 //                  UWORD8 *pu1_src,
    102 //                  UWORD32 num_bytes)
    103 //**************Variables Vs Registers*************************
    104 //    x0 => *pu1_dst
    105 //    x1 => *pu1_src
    106 //    w2 => num_bytes
    107 
    108 
    109 
    .global ih264_memcpy_av8

//------------------------------------------------------------------------------
// void ih264_memcpy_av8(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
//
// Copies num_bytes bytes from pu1_src to pu1_dst in ascending address order:
// 8 bytes at a time through v0 while at least 8 bytes remain, then the
// remaining 0..7 bytes one at a time through w3.
//
// In:       x0 = pu1_dst, x1 = pu1_src, w2 = num_bytes (any value, 0 allowed)
// Clobbers: x0, x1, w2, w3, v0, condition flags (all caller-saved under AAPCS64)
//
// The count is unsigned, so the loop conditions use the unsigned forms
// (lo/hs) rather than the signed lt/ge.
//------------------------------------------------------------------------------
ih264_memcpy_av8:
    cbz       w2, .Lmemcpy_done              // zero length: the byte loop below
                                             // would otherwise wrap its counter
    subs      w2, w2, #8
    b.lo      .Lmemcpy_tail_setup            // fewer than 8 bytes in total

.Lmemcpy_neon_loop:
    ld1       {v0.8b}, [x1], #8              // load 8 bytes, src += 8
    st1       {v0.8b}, [x0], #8              // store 8 bytes, dst += 8

    subs      w2, w2, #8
    b.hs      .Lmemcpy_neon_loop             // no borrow: another 8 bytes remain

    // Here w2 = (bytes remaining) - 8, i.e. -8..-1 as a wrapped 32-bit value.
    cmn       w2, #8
    b.eq      .Lmemcpy_done                  // exactly 0 bytes remaining

.Lmemcpy_tail_setup:
    add       w2, w2, #8                     // w2 = bytes remaining, 1..7

.Lmemcpy_tail_loop:
    ldrb      w3, [x1], #1
    strb      w3, [x0], #1
    subs      w2, w2, #1
    b.ne      .Lmemcpy_tail_loop

.Lmemcpy_done:
    ret
    136 
    137 
    138 //void ih264_memset_mul_8(UWORD8 *pu1_dst,
    139 //                       UWORD8 value,
    140 //                       UWORD32 num_bytes)
    141 //**************Variables Vs Registers*************************
    142 //    x0 => *pu1_dst
    143 //    w1 => value
    144 //    w2 => num_bytes
    145 
    146 
    .global ih264_memset_mul_8_av8

//------------------------------------------------------------------------------
// void ih264_memset_mul_8_av8(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
//
// Fills num_bytes bytes at pu1_dst with the low byte of value, 8 bytes per
// iteration.
//
// In:       x0 = pu1_dst, w1 = value, w2 = num_bytes
//           (expected multiple of 8; the original note says 8, 16 or 32)
// Clobbers: x0, w2, v0, condition flags (all caller-saved under AAPCS64)
//
// num_bytes == 0 writes nothing. A count that is not a multiple of 8 is
// rounded up to the next multiple of 8 instead of wrapping the counter.
//------------------------------------------------------------------------------
ih264_memset_mul_8_av8:
    cbz       w2, .Lmemset_mul_8_done        // nothing to fill
    dup       v0.8b, w1                      // replicate the byte into 8 lanes

.Lmemset_mul_8_loop:
    st1       {v0.8b}, [x0], #8              // store 8 bytes, dst += 8

    subs      w2, w2, #8                     // w2 = bytes still to fill
    b.hi      .Lmemset_mul_8_loop            // unsigned: continue while w2 > 0

.Lmemset_mul_8_done:
    ret
    161 
    162 
    163 //void ih264_memset(UWORD8 *pu1_dst,
    164 //                       UWORD8 value,
    165 //                       UWORD32 num_bytes)
    166 //**************Variables Vs Registers*************************
    167 //    x0 => *pu1_dst
    168 //    w1 => value
    169 //    w2 => num_bytes
    170 
    171 
    172 
    .global ih264_memset_av8

//------------------------------------------------------------------------------
// void ih264_memset_av8(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes)
//
// Fills num_bytes bytes at pu1_dst with the low byte of value: 8 bytes at a
// time from v0 while at least 8 bytes remain, then the remaining 0..7 bytes
// one at a time.
//
// In:       x0 = pu1_dst, w1 = value, w2 = num_bytes (any value, 0 allowed)
// Clobbers: x0, w2, v0, condition flags (all caller-saved under AAPCS64)
//
// The count is unsigned, so the loop conditions use the unsigned forms
// (lo/hs) rather than the signed lt/ge.
//------------------------------------------------------------------------------
ih264_memset_av8:
    cbz       w2, .Lmemset_done              // zero length: the byte loop below
                                             // would otherwise wrap its counter
    subs      w2, w2, #8
    b.lo      .Lmemset_tail_setup            // fewer than 8 bytes in total
    dup       v0.8b, w1                      // replicate the byte into 8 lanes

.Lmemset_neon_loop:
    st1       {v0.8b}, [x0], #8              // store 8 bytes, dst += 8

    subs      w2, w2, #8
    b.hs      .Lmemset_neon_loop             // no borrow: another 8 bytes remain

    // Here w2 = (bytes remaining) - 8, i.e. -8..-1 as a wrapped 32-bit value.
    cmn       w2, #8
    b.eq      .Lmemset_done                  // exactly 0 bytes remaining

.Lmemset_tail_setup:
    add       w2, w2, #8                     // w2 = bytes remaining, 1..7

.Lmemset_tail_loop:
    strb      w1, [x0], #1
    subs      w2, w2, #1
    b.ne      .Lmemset_tail_loop

.Lmemset_done:
    ret
    198 
    199 
    200 
    201 
    202 
    203 //void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
    204 //                                      UWORD16 value,
    205 //                                      UWORD32 num_words)
    206 //**************Variables Vs Registers*************************
    207 //    x0 => *pu2_dst
    208 //    w1 => value
    209 //    w2 => num_words
    210 
    211 
    .global ih264_memset_16bit_mul_8_av8

//------------------------------------------------------------------------------
// void ih264_memset_16bit_mul_8_av8(UWORD16 *pu2_dst, UWORD16 value,
//                                   UWORD32 num_words)
//
// Fills num_words 16-bit elements at pu2_dst with the low halfword of value,
// 8 elements (16 bytes) per iteration.
//
// In:       x0 = pu2_dst, w1 = value, w2 = num_words
//           (expected multiple of 8; the original note says 8, 16 or 32)
// Clobbers: x0, w2, v0, condition flags (all caller-saved under AAPCS64)
//
// num_words == 0 writes nothing. A count that is not a multiple of 8 is
// rounded up to the next multiple of 8 instead of wrapping the counter.
//------------------------------------------------------------------------------
ih264_memset_16bit_mul_8_av8:
    cbz       w2, .Lmemset_16bit_mul_8_done  // nothing to fill
    dup       v0.8h, w1                      // replicate the halfword into 8 lanes

.Lmemset_16bit_mul_8_loop:
    st1       {v0.8h}, [x0], #16             // store 8 halfwords, dst += 16 bytes

    subs      w2, w2, #8                     // w2 = halfwords still to fill
    b.hi      .Lmemset_16bit_mul_8_loop      // unsigned: continue while w2 > 0

.Lmemset_16bit_mul_8_done:
    ret
    228 
    229 
    230 
    231 //void ih264_memset_16bit(UWORD16 *pu2_dst,
    232 //                       UWORD16 value,
    233 //                       UWORD32 num_words)
    234 //**************Variables Vs Registers*************************
    235 //    x0 => *pu2_dst
    236 //    w1 => value
    237 //    w2 => num_words
    238 
    239 
    240 
    .global ih264_memset_16bit_av8

//------------------------------------------------------------------------------
// void ih264_memset_16bit_av8(UWORD16 *pu2_dst, UWORD16 value,
//                             UWORD32 num_words)
//
// Fills num_words 16-bit elements at pu2_dst with the low halfword of value:
// 8 elements (16 bytes) at a time from v0 while at least 8 remain, then the
// remaining 0..7 elements one at a time.
//
// In:       x0 = pu2_dst, w1 = value, w2 = num_words (any value, 0 allowed)
// Clobbers: x0, w2, v0, condition flags (all caller-saved under AAPCS64)
//
// The count is unsigned, so the loop conditions use the unsigned forms
// (lo/hs) rather than the signed lt/ge.
//------------------------------------------------------------------------------
ih264_memset_16bit_av8:
    cbz       w2, .Lmemset_16bit_done        // zero length: the tail loop below
                                             // would otherwise wrap its counter
    subs      w2, w2, #8
    b.lo      .Lmemset_16bit_tail_setup      // fewer than 8 elements in total
    dup       v0.8h, w1                      // replicate the halfword into 8 lanes

.Lmemset_16bit_neon_loop:
    st1       {v0.8h}, [x0], #16             // store 8 halfwords, dst += 16 bytes

    subs      w2, w2, #8
    b.hs      .Lmemset_16bit_neon_loop       // no borrow: another 8 elements remain

    // Here w2 = (elements remaining) - 8, i.e. -8..-1 as a wrapped 32-bit value.
    cmn       w2, #8
    b.eq      .Lmemset_16bit_done            // exactly 0 elements remaining

.Lmemset_16bit_tail_setup:
    add       w2, w2, #8                     // w2 = elements remaining, 1..7

.Lmemset_16bit_tail_loop:
    strh      w1, [x0], #2                   // one halfword, dst += 2 bytes
    subs      w2, w2, #1
    b.ne      .Lmemset_16bit_tail_loop

.Lmemset_16bit_done:
    ret
    268 
    269 
    270 
    271