Home | History | Annotate | Download | only in arm
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21 *******************************************************************************
     22 * @file
     23 *  ihevce_copy_neon.c
     24 *
     25 * @brief
     26 *  Contains intrinsic definitions of functions for block copy
     27 *
     28 * @author
     29 *  ittiam
     30 *
     31 * @par List of Functions:
     32 *  - ihevce_2d_square_copy_luma_neon()
     33 *  - ihevce_copy_2d_neon()
     34 *  - ihevce_chroma_interleave_2d_copy_neon()
     35 *
     36 * @remarks
     37 *  None
     38 *
     39 *******************************************************************************
     40 */
     41 
     42 /*****************************************************************************/
     43 /* File Includes                                                             */
     44 /*****************************************************************************/
     45 /* System include files */
     46 #include <string.h>
     47 #include <assert.h>
     48 #include <arm_neon.h>
     49 
     50 /* User include files */
     51 #include "ihevc_typedefs.h"
     52 #include "itt_video_api.h"
     53 #include "ihevc_platform_macros.h"
     54 
     55 #include "ihevce_cmn_utils_instr_set_router.h"
     56 
     57 /*****************************************************************************/
     58 /* Function Definitions                                                      */
     59 /*****************************************************************************/
     60 
     61 void ihevce_chroma_interleave_2d_copy_neon(
     62     UWORD8 *pu1_uv_src,
     63     WORD32 src_strd,
     64     UWORD8 *pu1_uv_dst,
     65     WORD32 dst_strd,
     66     WORD32 w,
     67     WORD32 h,
     68     CHROMA_PLANE_ID_T e_chroma_plane)
     69 {
     70     (void)h;
     71     assert(w == h);
     72     assert((e_chroma_plane == U_PLANE) || (e_chroma_plane == V_PLANE));
     73 
     74     if(w == 4)
     75     {
     76         uint16x4_t select = vdup_n_u16(0xff << (e_chroma_plane << 3));
     77 
     78         for(; w > 0; w--)
     79         {
     80             uint8x8_t src_0, dst_0;
     81 
     82             // row 0
     83             src_0 = vld1_u8(pu1_uv_src);
     84             dst_0 = vld1_u8(pu1_uv_dst);
     85             dst_0 = vbsl_u8(vreinterpret_u8_u16(select), src_0, dst_0);
     86             vst1_u8(pu1_uv_dst, dst_0);
     87             pu1_uv_src += src_strd;
     88             pu1_uv_dst += dst_strd;
     89         }
     90     }
     91     else
     92     {
     93         uint16x8_t select = vdupq_n_u16(0xff << (e_chroma_plane << 3));
     94         WORD32 i, j;
     95 
     96         assert(w % 8 == 0);
     97         for(j = 0; j < w; j += 1)
     98         {
     99             UWORD8 *dst_ol = pu1_uv_dst + j * dst_strd;
    100             UWORD8 *src_ol = pu1_uv_src + j * src_strd;
    101 
    102             for(i = 0; i < w; i += 8)
    103             {
    104                 UWORD8 *dst_il = dst_ol + (i * 2);
    105                 UWORD8 *src_il = src_ol + (i * 2);
    106                 uint8x16_t src_0, dst_0;
    107 
    108                 // row 0
    109                 src_0 = vld1q_u8(src_il);
    110                 dst_0 = vld1q_u8(dst_il);
    111                 dst_0 = vbslq_u8(vreinterpretq_u8_u16(select), src_0, dst_0);
    112                 vst1q_u8(dst_il, dst_0);
    113             }
    114         }
    115     }
    116 }
    117 
    118 static void copy_2d_neon(
    119     UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
    120 {
    121     assert(blk_wd == 4 || blk_wd == 8 || blk_wd == 16 || blk_wd == 32 || (blk_wd % 64 == 0));
    122 
    123     if(blk_wd == 4)
    124     {
    125         assert((blk_ht & 1) == 0);
    126         for(; blk_ht > 0; blk_ht -= 2)
    127         {
    128             // row 0
    129             *(uint32_t *)pu1_dst = *(const uint32_t *)pu1_src;
    130             pu1_src += src_strd;
    131             pu1_dst += dst_strd;
    132             // row 1
    133             *(uint32_t *)pu1_dst = *(const uint32_t *)pu1_src;
    134             pu1_src += src_strd;
    135             pu1_dst += dst_strd;
    136         }
    137     }
    138     else if(blk_wd == 8)
    139     {
    140         assert((blk_ht & 1) == 0);
    141         for(; blk_ht > 0; blk_ht -= 2)
    142         {
    143             uint8x8_t src_0, src_1;
    144 
    145             // row 0
    146             src_0 = vld1_u8(pu1_src);
    147             vst1_u8(pu1_dst, src_0);
    148             // row 1
    149             src_1 = vld1_u8(pu1_src + src_strd);
    150             vst1_u8(pu1_dst + dst_strd, src_1);
    151             pu1_src += 2 * src_strd;
    152             pu1_dst += 2 * dst_strd;
    153         }
    154     }
    155     else if(blk_wd == 16)
    156     {
    157         assert((blk_ht & 1) == 0);
    158         for(; blk_ht > 0; blk_ht -= 2)
    159         {
    160             uint8x16_t src_0, src_1;
    161 
    162             // row 0
    163             src_0 = vld1q_u8(pu1_src);
    164             vst1q_u8(pu1_dst, src_0);
    165             // row 1
    166             src_1 = vld1q_u8(pu1_src + src_strd);
    167             vst1q_u8(pu1_dst + dst_strd, src_1);
    168             pu1_src += 2 * src_strd;
    169             pu1_dst += 2 * dst_strd;
    170         }
    171     }
    172     else if(blk_wd == 32)
    173     {
    174         for(; blk_ht > 0; blk_ht--)
    175         {
    176             uint8x16_t src_0, src_1;
    177 
    178             // row 0
    179             src_0 = vld1q_u8(pu1_src);
    180             vst1q_u8(pu1_dst, src_0);
    181             src_1 = vld1q_u8(pu1_src + 16);
    182             vst1q_u8(pu1_dst + 16, src_1);
    183             pu1_src += src_strd;
    184             pu1_dst += dst_strd;
    185         }
    186     }
    187     else if(blk_wd % 64 == 0)
    188     {
    189         WORD32 i, j;
    190 
    191         for(j = 0; j < blk_ht; j += 1)
    192         {
    193             UWORD8 *dst_ol = pu1_dst + j * dst_strd;
    194             UWORD8 *src_ol = pu1_src + j * src_strd;
    195 
    196             for(i = 0; i < blk_wd; i += 64)
    197             {
    198                 uint8x16_t src_0, src_1, src_2, src_3;
    199                 UWORD8 *dst_il = dst_ol + i;
    200                 UWORD8 *src_il = src_ol + i;
    201 
    202                 src_0 = vld1q_u8(src_il);
    203                 vst1q_u8(dst_il, src_0);
    204                 src_1 = vld1q_u8(src_il + 16);
    205                 vst1q_u8(dst_il + 16, src_1);
    206                 src_2 = vld1q_u8(src_il + 32);
    207                 vst1q_u8(dst_il + 32, src_2);
    208                 src_3 = vld1q_u8(src_il + 48);
    209                 vst1q_u8(dst_il + 48, src_3);
    210             }
    211         }
    212     }
    213 }
    214 
    215 void ihevce_2d_square_copy_luma_neon(
    216     void *p_dst,
    217     WORD32 dst_strd,
    218     void *p_src,
    219     WORD32 src_strd,
    220     WORD32 num_cols_to_copy,
    221     WORD32 unit_size)
    222 {
    223     UWORD8 *pu1_dst = (UWORD8 *)p_dst;
    224     UWORD8 *pu1_src = (UWORD8 *)p_src;
    225 
    226     copy_2d_neon(
    227         pu1_dst,
    228         dst_strd * unit_size,
    229         pu1_src,
    230         src_strd * unit_size,
    231         num_cols_to_copy * unit_size,
    232         num_cols_to_copy);
    233 }
    234 
    235 void ihevce_copy_2d_neon(
    236     UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
    237 {
    238     if(blk_wd == 0)
    239         return;
    240 
    241     if(blk_wd > 64)
    242     {
    243         copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 64, blk_ht);
    244         ihevce_copy_2d_neon(pu1_dst + 64, dst_strd, pu1_src + 64, src_strd, blk_wd - 64, blk_ht);
    245     }
    246     else if(blk_wd > 32)
    247     {
    248         copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 32, blk_ht);
    249         ihevce_copy_2d_neon(pu1_dst + 32, dst_strd, pu1_src + 32, src_strd, blk_wd - 32, blk_ht);
    250     }
    251     else if(blk_wd >= 16)
    252     {
    253         if(blk_ht % 2 == 0)
    254         {
    255             copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 16, blk_ht);
    256             ihevce_copy_2d_neon(
    257                 pu1_dst + 16, dst_strd, pu1_src + 16, src_strd, blk_wd - 16, blk_ht);
    258         }
    259         else
    260         {
    261             copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 16, blk_ht - 1);
    262             memcpy(pu1_dst + (blk_ht - 1) * dst_strd, pu1_src + (blk_ht - 1) * src_strd, blk_wd);
    263             ihevce_copy_2d_neon(
    264                 pu1_dst + 16, dst_strd, pu1_src + 16, src_strd, blk_wd - 16, blk_ht - 1);
    265         }
    266     }
    267     else if(blk_wd >= 8)
    268     {
    269         if(blk_ht % 2 == 0)
    270         {
    271             copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 8, blk_ht);
    272             ihevce_copy_2d_neon(pu1_dst + 8, dst_strd, pu1_src + 8, src_strd, blk_wd - 8, blk_ht);
    273         }
    274         else
    275         {
    276             copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 8, blk_ht - 1);
    277             memcpy(pu1_dst + (blk_ht - 1) * dst_strd, pu1_src + (blk_ht - 1) * src_strd, blk_wd);
    278             ihevce_copy_2d_neon(
    279                 pu1_dst + 8, dst_strd, pu1_src + 8, src_strd, blk_wd - 8, blk_ht - 1);
    280         }
    281     }
    282     else if(blk_wd >= 4)
    283     {
    284         if(blk_ht % 2 == 0)
    285         {
    286             copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 4, blk_ht);
    287             ihevce_copy_2d_neon(pu1_dst + 4, dst_strd, pu1_src + 4, src_strd, blk_wd - 4, blk_ht);
    288         }
    289         else
    290         {
    291             copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, 4, blk_ht - 1);
    292             memcpy(pu1_dst + (blk_ht - 1) * dst_strd, pu1_src + (blk_ht - 1) * src_strd, blk_wd);
    293             ihevce_copy_2d_neon(
    294                 pu1_dst + 4, dst_strd, pu1_src + 4, src_strd, blk_wd - 4, blk_ht - 1);
    295         }
    296     }
    297     else
    298     {
    299         ihevce_copy_2d(pu1_dst, dst_strd, pu1_src, src_strd, blk_wd, blk_ht);
    300     }
    301 }
    302