Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  ih264_mem_fns_neon.s
     24 // *
     25 // * @brief
     26 // *  Contains function definitions for memory manipulation
     27 // *
     28 // * @author
     29 // *     Naveen SR
     30 // *
     31 // * @par List of Functions:
// *  - ih264_memcpy_av8()
// *  - ih264_memcpy_mul_8_av8()
// *  - ih264_memset_av8()
// *  - ih264_memset_mul_8_av8()
// *  - ih264_memset_16bit_mul_8_av8()
// *  - ih264_memset_16bit_av8()
     37 // *
     38 // * @remarks
     39 // *  None
     40 // *
     41 // *******************************************************************************
     42 //*/
     43 
     44 .text
     45 .p2align 2
     46 .include "ih264_neon_macros.s"
     47 ///**
     48 //*******************************************************************************
     49 //*
     50 //* @brief
     51 //*   memcpy of a 1d array
     52 //*
     53 //* @par Description:
//*   Copies 8-bit data from source to destination, 8 bytes at a time; num_bytes is expected to be a multiple of 8 (e.g. 8, 16 or 32)
     55 //*
//* @param[out] pu1_dst
     57 //*  UWORD8 pointer to the destination
     58 //*
     59 //* @param[in] pu1_src
     60 //*  UWORD8 pointer to the source
     61 //*
     62 //* @param[in] num_bytes
     63 //*  number of bytes to copy
     64 //* @returns
     65 //*
     66 //* @remarks
     67 //*  None
     68 //*
     69 //*******************************************************************************
     70 //*/
     71 //void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
     72 //                      UWORD8 *pu1_src,
     73 //                      UWORD8 num_bytes)
     74 //**************Variables Vs Registers*************************
     75 //    x0 => *pu1_dst
     76 //    x1 => *pu1_src
     77 //    x2 => num_bytes
     78 
     79 
     80 
     81 
     82 
     83     .global ih264_memcpy_mul_8_av8
     84 
     85 ih264_memcpy_mul_8_av8:
     86 
     87 loop_neon_memcpy_mul_8:
     88     // Memcpy 8 bytes
     89     ld1       {v0.8b}, [x1], #8
     90     st1       {v0.8b}, [x0], #8
     91 
     92     subs      x2, x2, #8
     93     bne       loop_neon_memcpy_mul_8
     94     ret
     95 
     96 
     97 
     98 //*******************************************************************************
     99 //*/
    100 //void ih264_memcpy(UWORD8 *pu1_dst,
    101 //                  UWORD8 *pu1_src,
    102 //                  UWORD8 num_bytes)
    103 //**************Variables Vs Registers*************************
    104 //    x0 => *pu1_dst
    105 //    x1 => *pu1_src
    106 //    x2 => num_bytes
    107 
    108 
    109 
    .global ih264_memcpy_av8

// ih264_memcpy_av8
//   Copies num_bytes bytes from pu1_src to pu1_dst. Whole groups of 8 bytes
//   go through a 64-bit NEON load/store pair; the remaining 0..7 bytes are
//   copied one at a time.
//
//   In:    x0 = pu1_dst, x1 = pu1_src, w2 = num_bytes
//   Out:   nothing
//   Clobb: x0, x1, x2, w3, v0, condition flags
//
//   A count of 0 returns without touching memory.
//   NOTE(review): the count is consumed as a 32-bit value; the prototype
//   comment in this file declares it as UWORD8 - confirm against the C header.
ih264_memcpy_av8:
    // Zero-extend the count: x2[63:32] is unspecified on entry for an
    // argument narrower than 64 bits.
    mov       w2, w2
    subs      x2, x2, #8
    blt       arm_memcpy                // fewer than 8 bytes in total

loop_neon_memcpy:
    // Memcpy 8 bytes
    ld1       {v0.8b}, [x1], #8
    st1       {v0.8b}, [x0], #8

    subs      x2, x2, #8
    bge       loop_neon_memcpy

arm_memcpy:
    // x2 is in [-8, -1] here; adding 8 back yields the 0..7 leftover bytes
    // and sets Z when there are none.
    adds      x2, x2, #8
    beq       end_func1

loop_arm_memcpy:
    ldrb      w3, [x1], #1
    strb      w3, [x0], #1
    subs      x2, x2, #1
    bne       loop_arm_memcpy

end_func1:
    ret
    138 
    139 
    140 //void ih264_memset_mul_8(UWORD8 *pu1_dst,
    141 //                       UWORD8 value,
    142 //                       UWORD8 num_bytes)
    143 //**************Variables Vs Registers*************************
    144 //    x0 => *pu1_dst
    145 //    x1 => value
    146 //    x2 => num_bytes
    147 
    148 
    .global ih264_memset_mul_8_av8

// ih264_memset_mul_8_av8
//   Stores the low byte of value into num_bytes consecutive bytes starting
//   at pu1_dst, 8 bytes per iteration.
//
//   In:    x0 = pu1_dst, w1 = value (low 8 bits used), w2 = num_bytes
//   Out:   nothing
//   Clobb: x0, x2, v0, condition flags
//
//   Only whole groups of 8 bytes are written: a count of 0 writes nothing and
//   a count that is not a multiple of 8 is rounded down.
//   NOTE(review): the count is consumed as a 32-bit value; the prototype
//   comment in this file declares it as UWORD8 - confirm against the C header.
ih264_memset_mul_8_av8:

// Assumptions: numbytes is either 8, 16 or 32
    // Keep bits [31:3] of the count only. Writing w2 also clears x2[63:32],
    // which is unspecified on entry. Z is set when there is nothing to write.
    ands      w2, w2, #0xfffffff8
    beq       end_memset_mul_8

    dup       v0.8b, w1                 // replicate the fill byte into 8 lanes
loop_memset_mul_8:
    // Memset 8 bytes
    st1       {v0.8b}, [x0], #8

    subs      w2, w2, #8                // w2 = bytes still to write
    bne       loop_memset_mul_8

end_memset_mul_8:
    ret
    163 
    164 
    165 //void ih264_memset(UWORD8 *pu1_dst,
    166 //                       UWORD8 value,
    167 //                       UWORD8 num_bytes)
    168 //**************Variables Vs Registers*************************
    169 //    x0 => *pu1_dst
    170 //    x1 => value
    171 //    x2 => num_bytes
    172 
    173 
    174 
    .global ih264_memset_av8

// ih264_memset_av8
//   Stores the low byte of value into num_bytes consecutive bytes starting
//   at pu1_dst. Whole groups of 8 bytes are written with a 64-bit NEON store;
//   the remaining 0..7 bytes are written one at a time.
//
//   In:    x0 = pu1_dst, w1 = value (low 8 bits used), w2 = num_bytes
//   Out:   nothing
//   Clobb: x0, x2, v0, condition flags
//
//   A count of 0 returns without touching memory.
//   NOTE(review): the count is consumed as a 32-bit value; the prototype
//   comment in this file declares it as UWORD8 - confirm against the C header.
ih264_memset_av8:
    // Zero-extend the count: x2[63:32] is unspecified on entry for an
    // argument narrower than 64 bits.
    mov       w2, w2
    subs      x2, x2, #8
    blt       arm_memset                // fewer than 8 bytes in total

    dup       v0.8b, w1                 // replicate the fill byte into 8 lanes
loop_neon_memset:
    // Memset 8 bytes
    st1       {v0.8b}, [x0], #8

    subs      x2, x2, #8
    bge       loop_neon_memset

arm_memset:
    // x2 is in [-8, -1] here; adding 8 back yields the 0..7 leftover bytes
    // and sets Z when there are none.
    adds      x2, x2, #8
    beq       end_func2

loop_arm_memset:
    strb      w1, [x0], #1
    subs      x2, x2, #1
    bne       loop_arm_memset

end_func2:
    ret
    201 
    202 
    203 
    204 
    205 
    206 //void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
    207 //                                      UWORD16 value,
    208 //                                      UWORD8 num_words)
    209 //**************Variables Vs Registers*************************
    210 //    x0 => *pu2_dst
    211 //    x1 => value
    212 //    x2 => num_words
    213 
    214 
    .global ih264_memset_16bit_mul_8_av8

// ih264_memset_16bit_mul_8_av8
//   Stores the low 16 bits of value into num_words consecutive halfwords
//   starting at pu2_dst, 8 halfwords (16 bytes) per iteration.
//
//   In:    x0 = pu2_dst, w1 = value (low 16 bits used), w2 = num_words
//   Out:   nothing
//   Clobb: x0, x2, v0, condition flags
//
//   Only whole groups of 8 halfwords are written: a count of 0 writes nothing
//   and a count that is not a multiple of 8 is rounded down.
//   NOTE(review): the count is consumed as a 32-bit value; the prototype
//   comment in this file declares it as UWORD8 - confirm against the C header.
ih264_memset_16bit_mul_8_av8:

// Assumptions: num_words is either 8, 16 or 32

    // Keep bits [31:3] of the count only. Writing w2 also clears x2[63:32],
    // which is unspecified on entry. Z is set when there is nothing to write.
    ands      w2, w2, #0xfffffff8
    beq       end_memset_16bit_mul_8

    dup       v0.8h, w1                 // replicate the fill value into 8 lanes
loop_memset_16bit_mul_8:
    // Memset 8 words with a single 128-bit store
    st1       {v0.8h}, [x0], #16

    subs      w2, w2, #8                // w2 = halfwords still to write
    bne       loop_memset_16bit_mul_8

end_memset_16bit_mul_8:
    ret
    231 
    232 
    233 
    234 //void ih264_memset_16bit(UWORD16 *pu2_dst,
    235 //                       UWORD16 value,
    236 //                       UWORD8 num_words)
    237 //**************Variables Vs Registers*************************
    238 //    x0 => *pu2_dst
    239 //    x1 => value
    240 //    x2 => num_words
    241 
    242 
    243 
    .global ih264_memset_16bit_av8

// ih264_memset_16bit_av8
//   Stores the low 16 bits of value into num_words consecutive halfwords
//   starting at pu2_dst. Whole groups of 8 halfwords are written with a
//   128-bit NEON store; the remaining 0..7 halfwords are written one at a
//   time.
//
//   In:    x0 = pu2_dst, w1 = value (low 16 bits used), w2 = num_words
//   Out:   nothing
//   Clobb: x0, x2, v0, condition flags
//
//   A count of 0 returns without touching memory.
//   NOTE(review): the count is consumed as a 32-bit value; the prototype
//   comment in this file declares it as UWORD8 - confirm against the C header.
ih264_memset_16bit_av8:
    // Zero-extend the count: x2[63:32] is unspecified on entry for an
    // argument narrower than 64 bits.
    mov       w2, w2
    subs      x2, x2, #8
    blt       arm_memset_16bit          // fewer than 8 halfwords in total

    dup       v0.8h, w1                 // replicate the fill value into 8 lanes
loop_neon_memset_16bit:
    // Memset 8 words with a single 128-bit store
    st1       {v0.8h}, [x0], #16

    subs      x2, x2, #8
    bge       loop_neon_memset_16bit

arm_memset_16bit:
    // x2 is in [-8, -1] here; adding 8 back yields the 0..7 leftover
    // halfwords and sets Z when there are none.
    adds      x2, x2, #8
    beq       end_func3

loop_arm_memset_16bit:
    strh      w1, [x0], #2
    subs      x2, x2, #1
    bne       loop_arm_memset_16bit

end_func3:
    ret
    272 
    273 
    274 
    275