Home | History | Annotate | Download | only in arm
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21 *******************************************************************************
     22 * @file
     23 *  ihevce_decomp_pre_intra_pass_neon.c
     24 *
     25 * @brief
     26 *  Contains functions to perform input scaling
     27 *
     28 * @author
     29 *  Ittiam
     30 *
     31 * @par List of Functions:
     32 *
     33 * @remarks
     34 *  None
     35 *
     36 ********************************************************************************
     37 */
     38 /*****************************************************************************/
     39 /* File Includes                                                             */
     40 /*****************************************************************************/
     41 /* System include files */
     42 #include <stdio.h>
     43 #include <string.h>
     44 #include <assert.h>
     45 #include <arm_neon.h>
     46 
     47 /* User include files */
     48 #include "ihevc_typedefs.h"
     49 #include "ihevc_macros.h"
     50 #include "ihevc_platform_macros.h"
     51 #include "itt_video_api.h"
     52 #include "ihevc_defs.h"
     53 #include "ihevc_cmn_utils_neon.h"
     54 #include "ihevce_ipe_instr_set_router.h"
     55 
     56 /*****************************************************************************/
     57 /* Function Definitions                                                      */
     58 /*****************************************************************************/
     59 void ihevce_scaling_filter_mxn(
     60     UWORD8 *pu1_src,
     61     WORD32 src_strd,
     62     UWORD8 *pu1_scrtch,
     63     WORD32 scrtch_strd,
     64     UWORD8 *pu1_dst,
     65     WORD32 dst_strd,
     66     WORD32 ht,
     67     WORD32 wd)
     68 {
     69 #define FILT_TAP_Q 8
     70 #define N_TAPS 7
     71     const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
     72     WORD32 i, j;
     73     WORD32 tmp;
     74     UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd;
     75     UWORD8 *pu1_scrtch_tmp = pu1_scrtch;
     76 
     77     /* horizontal filtering */
     78     for(i = -3; i < ht + 2; i++)
     79     {
     80         for(j = 0; j < wd; j += 2)
     81         {
     82             tmp = (i4_ftaps[3] * pu1_src_tmp[j] +
     83                    i4_ftaps[2] * (pu1_src_tmp[j - 1] + pu1_src_tmp[j + 1]) +
     84                    i4_ftaps[1] * (pu1_src_tmp[j + 2] + pu1_src_tmp[j - 2]) +
     85                    i4_ftaps[0] * (pu1_src_tmp[j + 3] + pu1_src_tmp[j - 3]) +
     86                    (1 << (FILT_TAP_Q - 1))) >>
     87                   FILT_TAP_Q;
     88             pu1_scrtch_tmp[j >> 1] = CLIP_U8(tmp);
     89         }
     90         pu1_scrtch_tmp += scrtch_strd;
     91         pu1_src_tmp += src_strd;
     92     }
     93     /* vertical filtering */
     94     pu1_scrtch_tmp = pu1_scrtch + 3 * scrtch_strd;
     95     for(i = 0; i < ht; i += 2)
     96     {
     97         for(j = 0; j < (wd >> 1); j++)
     98         {
     99             tmp =
    100                 (i4_ftaps[3] * pu1_scrtch_tmp[j] +
    101                  i4_ftaps[2] * (pu1_scrtch_tmp[j + scrtch_strd] + pu1_scrtch_tmp[j - scrtch_strd]) +
    102                  i4_ftaps[1] *
    103                      (pu1_scrtch_tmp[j + 2 * scrtch_strd] + pu1_scrtch_tmp[j - 2 * scrtch_strd]) +
    104                  i4_ftaps[0] *
    105                      (pu1_scrtch_tmp[j + 3 * scrtch_strd] + pu1_scrtch_tmp[j - 3 * scrtch_strd]) +
    106                  (1 << (FILT_TAP_Q - 1))) >>
    107                 FILT_TAP_Q;
    108             pu1_dst[j] = CLIP_U8(tmp);
    109         }
    110         pu1_dst += dst_strd;
    111         pu1_scrtch_tmp += (scrtch_strd << 1);
    112     }
    113 }
    114 
    115 void ihevce_scale_by_2_neon(
    116     UWORD8 *pu1_src,
    117     WORD32 src_strd,
    118     UWORD8 *pu1_dst,
    119     WORD32 dst_strd,
    120     WORD32 wd,
    121     WORD32 ht,
    122     UWORD8 *pu1_wkg_mem,
    123     WORD32 ht_offset,
    124     WORD32 block_ht,
    125     WORD32 wd_offset,
    126     WORD32 block_wd,
    127     FT_COPY_2D *pf_copy_2d)
    128 {
    129 #define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
    130     UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ];
    131     UWORD32 cpy_strd = MAX_BLK_SZ;
    132     UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1);
    133 
    134     UWORD8 *pu1_in, *pu1_out;
    135     WORD32 in_strd, wkg_mem_strd;
    136 
    137     WORD32 row_start, row_end;
    138     WORD32 col_start, col_end;
    139     WORD32 i, fun_select;
    140     WORD32 ht_tmp, wd_tmp;
    141     FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2];
    142 
    143     assert((wd & 1) == 0);
    144     assert((ht & 1) == 0);
    145     assert(block_wd <= MAX_CTB_SIZE);
    146     assert(block_ht <= MAX_CTB_SIZE);
    147 
    148     /* function pointers for filtering different dimensions */
    149     ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn;
    150     ihevce_scaling_filters[1] = ihevce_scaling_filter_mxn_neon;
    151 
    152     /* handle boundary blks */
    153     col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0;
    154     row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0;
    155     col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0;
    156     row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0;
    157     if(col_end && (wd % block_wd != 0))
    158     {
    159         block_wd = (wd % block_wd);
    160     }
    161     if(row_end && (ht % block_ht != 0))
    162     {
    163         block_ht = (ht % block_ht);
    164     }
    165 
    166     /* boundary blks needs to be padded, copy src to tmp buffer */
    167     if(col_start || col_end || row_end || row_start)
    168     {
    169         UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd;
    170 
    171         pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start));
    172         pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start));
    173         ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end);
    174         wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end);
    175         pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp);
    176         pu1_in = au1_cpy + cpy_strd * 3 + 3;
    177         in_strd = cpy_strd;
    178     }
    179     else
    180     {
    181         pu1_in = pu1_src + wd_offset + ht_offset * src_strd;
    182         in_strd = src_strd;
    183     }
    184 
    185     /*top padding*/
    186     if(row_start)
    187     {
    188         UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3;
    189 
    190         pu1_cpy = au1_cpy + cpy_strd * (3 - 1);
    191         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    192         pu1_cpy -= cpy_strd;
    193         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    194         pu1_cpy -= cpy_strd;
    195         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    196     }
    197 
    198     /*bottom padding*/
    199     if(row_end)
    200     {
    201         UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd;
    202 
    203         pu1_cpy = pu1_cpy_tmp + cpy_strd;
    204         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    205         pu1_cpy += cpy_strd;
    206         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    207         pu1_cpy += cpy_strd;
    208         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    209     }
    210 
    211     /*left padding*/
    212     if(col_start)
    213     {
    214         UWORD8 *pu1_cpy_tmp = au1_cpy + 3;
    215 
    216         pu1_cpy = au1_cpy;
    217         for(i = 0; i < block_ht + 6; i++)
    218         {
    219             pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
    220             pu1_cpy += cpy_strd;
    221             pu1_cpy_tmp += cpy_strd;
    222         }
    223     }
    224 
    225     /*right padding*/
    226     if(col_end)
    227     {
    228         UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1;
    229 
    230         pu1_cpy = au1_cpy + 3 + block_wd;
    231         for(i = 0; i < block_ht + 6; i++)
    232         {
    233             pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
    234             pu1_cpy += cpy_strd;
    235             pu1_cpy_tmp += cpy_strd;
    236         }
    237     }
    238 
    239     wkg_mem_strd = block_wd >> 1;
    240     pu1_out = pu1_dst + (wd_offset >> 1);
    241     fun_select = (block_wd % 16 == 0);
    242     ihevce_scaling_filters[fun_select](
    243         pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd);
    244 
    245     /* Left padding of 16 for 1st block of every row */
    246     if(wd_offset == 0)
    247     {
    248         UWORD8 u1_val;
    249         WORD32 pad_wd = 16;
    250         WORD32 pad_ht = block_ht >> 1;
    251         UWORD8 *dst = pu1_dst;
    252 
    253         for(i = 0; i < pad_ht; i++)
    254         {
    255             u1_val = dst[0];
    256             memset(&dst[-pad_wd], u1_val, pad_wd);
    257             dst += dst_strd;
    258         }
    259     }
    260 
    261     if(wd == wd_offset + block_wd)
    262     {
    263         /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */
    264         /* Right padding is done only after processing of last block of that row is done*/
    265         UWORD8 u1_val;
    266         WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
    267         WORD32 pad_ht = block_ht >> 1;
    268         UWORD8 *dst = pu1_dst + (wd >> 1) - 1;
    269 
    270         for(i = 0; i < pad_ht; i++)
    271         {
    272             u1_val = dst[0];
    273             memset(&dst[1], u1_val, pad_wd);
    274             dst += dst_strd;
    275         }
    276 
    277         if(ht_offset == 0)
    278         {
    279             /* Top padding of 16 is done for 1st row only after we reach end of that row */
    280             WORD32 pad_wd = dst_strd;
    281             WORD32 pad_ht = 16;
    282             UWORD8 *dst = pu1_dst - 16;
    283 
    284             for(i = 1; i <= pad_ht; i++)
    285             {
    286                 memcpy(dst - (i * dst_strd), dst, pad_wd);
    287             }
    288         }
    289 
    290         /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have
    291          reached end of frame */
    292         if(ht - ht_offset - block_ht == 0)
    293         {
    294             WORD32 pad_wd = dst_strd;
    295             WORD32 pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
    296             UWORD8 *dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16;
    297 
    298             for(i = 1; i <= pad_ht; i++)
    299                 memcpy(dst + (i * dst_strd), dst, pad_wd);
    300         }
    301     }
    302 }
    303