Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19 *******************************************************************************
     20 * @file
     21 *  ihevcd_frm_cvt_x86_intr.c
     22 *
     23 * @brief
     24 *  Platform specific intrinsic implementation of certain functions
     25 *
     26 * @author
     27 *  Ittiam
     28 * @par List of Functions:
*  - ihevcd_fmt_conv_420sp_to_420p_ssse3
     31 *
     32 * @remarks
     33 *  None
     34 *
     35 *******************************************************************************
     36 */
     37 #include "string.h"
     38 #include "ihevc_typedefs.h"
     39 #include "ihevc_defs.h"
     40 #include "ihevc_macros.h"
     41 #include "ihevc_platform_macros.h"
     42 #include "ihevcd_function_selector.h"
     43 #include <string.h>
     44 #include <immintrin.h>
     45 
     46 
     47 void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src,
     48                                          UWORD8 *pu1_uv_src,
     49                                          UWORD8 *pu1_y_dst,
     50                                          UWORD8 *pu1_u_dst,
     51                                          UWORD8 *pu1_v_dst,
     52                                          WORD32 wd,
     53                                          WORD32 ht,
     54                                          WORD32 src_y_strd,
     55                                          WORD32 src_uv_strd,
     56                                          WORD32 dst_y_strd,
     57                                          WORD32 dst_uv_strd,
     58                                          WORD32 is_u_first,
     59                                          WORD32 disable_luma_copy)
     60 {
     61     UWORD8 *pu1_src, *pu1_dst;
     62     UWORD8 *pu1_u_src, *pu1_v_src;
     63     WORD32 num_rows, num_cols, src_strd, dst_strd, cols, rows;
     64     WORD32 i, j;
     65 
     66     cols = 0;
     67     pu1_u_src = (UWORD8 *)pu1_uv_src;
     68     pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
     69     if(0 == disable_luma_copy)
     70     {
     71         /* copy luma */
     72         pu1_src = (UWORD8 *)pu1_y_src;
     73         pu1_dst = (UWORD8 *)pu1_y_dst;
     74 
     75         num_rows = ht;
     76         num_cols = wd;
     77 
     78         src_strd = src_y_strd;
     79         dst_strd = dst_y_strd;
     80         for(i = 0; i < num_rows; i++)
     81         {
     82             memcpy(pu1_dst, pu1_src, num_cols);
     83             pu1_dst += dst_strd;
     84             pu1_src += src_strd;
     85         }
     86     }
     87 
     88     /* de-interleave U and V and copy to destination */
     89     if(!is_u_first)
     90     {
     91         UWORD8 *temp = pu1_u_dst;
     92         pu1_u_dst = pu1_v_dst;
     93         pu1_v_dst = temp;
     94 
     95         pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
     96         pu1_v_src = (UWORD8 *)pu1_uv_src;
     97     }
     98 
     99     {
    100         __m128i src_uv0_8x16b, src_uv1_8x16b, src_u_8x16b, src_v_8x16b;
    101         __m128i temp0_8x16b, temp1_8x16b, alt_first_mask;
    102 
    103         UWORD8 FIRST_ALT_SHUFFLE[16] = {
    104             0x00, 0x02, 0x04, 0x06,
    105             0x08, 0x0A, 0x0C, 0x0E,
    106             0x01, 0x03, 0x05, 0x07,
    107             0x09, 0x0B, 0x0D, 0x0F };
    108 
    109         PREFETCH((char const *)(pu1_uv_src + (0 * src_uv_strd)), _MM_HINT_T0)
    110         PREFETCH((char const *)(pu1_uv_src + (1 * src_uv_strd)), _MM_HINT_T0)
    111         PREFETCH((char const *)(pu1_uv_src + (2 * src_uv_strd)), _MM_HINT_T0)
    112         PREFETCH((char const *)(pu1_uv_src + (3 * src_uv_strd)), _MM_HINT_T0)
    113         PREFETCH((char const *)(pu1_uv_src + (4 * src_uv_strd)), _MM_HINT_T0)
    114         PREFETCH((char const *)(pu1_uv_src + (5 * src_uv_strd)), _MM_HINT_T0)
    115         PREFETCH((char const *)(pu1_uv_src + (6 * src_uv_strd)), _MM_HINT_T0)
    116         PREFETCH((char const *)(pu1_uv_src + (7 * src_uv_strd)), _MM_HINT_T0)
    117 
    118         num_rows = ht >> 1;
    119         num_cols = wd >> 1;
    120 
    121         src_strd = src_uv_strd;
    122         dst_strd = dst_uv_strd;
    123 
    124         alt_first_mask = _mm_loadu_si128((__m128i *)&FIRST_ALT_SHUFFLE[0]);
    125 
    126         if(num_cols > 15)
    127         {
    128             cols = num_cols >> 4;
    129 
    130             for(i = 0; i < (num_rows >> 2); i++)
    131             {
    132                 UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
    133 
    134                 PREFETCH((char const *)(pu1_uv_src + (8 * src_strd)), _MM_HINT_T0)
    135                 PREFETCH((char const *)(pu1_uv_src + (9 * src_strd)), _MM_HINT_T0)
    136                 PREFETCH((char const *)(pu1_uv_src + (10 * src_strd)), _MM_HINT_T0)
    137                 PREFETCH((char const *)(pu1_uv_src + (11 * src_strd)), _MM_HINT_T0)
    138 
    139                 pu1_uv_src_temp = pu1_uv_src;
    140                 pu1_u_dst_temp =  pu1_u_dst;
    141                 pu1_v_dst_temp =  pu1_v_dst;
    142 
    143                 for(j = 0; j < cols; j++)
    144                 {
    145 
    146                     /**** Row 0 ***/
    147                     src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
    148                     src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
    149 
    150                     temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
    151                     temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
    152 
    153                     src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
    154                     src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
    155 
    156                     _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
    157                     _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
    158 
    159                     /**** Row 1 ***/
    160                     src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd)));
    161                     src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd) + 16));
    162 
    163                     temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
    164                     temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
    165 
    166                     src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
    167                     src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
    168 
    169                     _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (1 * dst_strd)), src_u_8x16b);
    170                     _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (1 * dst_strd)), src_v_8x16b);
    171 
    172                     /**** Row 2 ***/
    173                     src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd)));
    174                     src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd) + 16));
    175 
    176                     temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
    177                     temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
    178 
    179                     src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
    180                     src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
    181 
    182                     _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (2 * dst_strd)), src_u_8x16b);
    183                     _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (2 * dst_strd)), src_v_8x16b);
    184 
    185                     /**** Row 3 ***/
    186                     src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd)));
    187                     src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd) + 16));
    188 
    189                     temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
    190                     temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
    191 
    192                     src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
    193                     src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
    194 
    195                     _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (3 * dst_strd)), src_u_8x16b);
    196                     _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (3 * dst_strd)), src_v_8x16b);
    197 
    198                     pu1_u_dst_temp += 16;
    199                     pu1_v_dst_temp += 16;
    200                     pu1_uv_src_temp += 32;
    201                 }
    202 
    203                 pu1_u_dst += 4 * dst_strd;
    204                 pu1_v_dst += 4 * dst_strd;
    205                 pu1_uv_src += 4 * src_strd;
    206                 //pu1_v_src += src_strd;
    207             }
    208             rows = num_rows & 0x3;
    209             if(rows)
    210             {
    211                 for(i = 0; i < rows; i++)
    212                 {
    213                     UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
    214 
    215                     pu1_uv_src_temp = pu1_uv_src;
    216                     pu1_u_dst_temp =  pu1_u_dst;
    217                     pu1_v_dst_temp =  pu1_v_dst;
    218 
    219                     for(j = 0; j < cols; j++)
    220                     {
    221 
    222                         src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
    223                         src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
    224 
    225                         temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
    226                         temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
    227 
    228                         src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
    229                         src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
    230 
    231                         _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
    232                         _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
    233 
    234                         pu1_u_dst_temp += 16;
    235                         pu1_v_dst_temp += 16;
    236                         pu1_uv_src_temp += 32;
    237                     }
    238 
    239                     pu1_u_dst += dst_strd;
    240                     pu1_v_dst += dst_strd;
    241                     pu1_uv_src += src_strd;
    242                 }
    243             }
    244             pu1_u_dst -= (num_rows * dst_strd);
    245             pu1_v_dst -= (num_rows * dst_strd);
    246             num_cols &= 0x0F;
    247         }
    248         if(num_cols)
    249         {
    250             pu1_u_dst += (cols << 4);
    251             pu1_v_dst += (cols << 4);
    252             pu1_u_src += 2 * (cols << 4);
    253             pu1_v_src += 2 * (cols << 4);
    254             for(i = 0; i < num_rows; i++)
    255             {
    256                 for(j = 0; j < num_cols; j++)
    257                 {
    258                     pu1_u_dst[j] = pu1_u_src[j * 2];
    259                     pu1_v_dst[j] = pu1_v_src[j * 2];
    260                 }
    261 
    262                 pu1_u_dst += dst_strd;
    263                 pu1_v_dst += dst_strd;
    264                 pu1_u_src += src_strd;
    265                 pu1_v_src += src_strd;
    266             }
    267         }
    268     }
    269     return;
    270 }
    271