Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19 *******************************************************************************
     20 * @file
     21 *  ihevcd_it_rec_dc_x86_intr.c
     22 *
     23 * @brief
     24 *  Platform specific intrinsic implementation of certain functions
     25 *
     26 * @author
     27 *  Ittiam
     28 * @par List of Functions:
     29 *  - ihevcd_itrans_recon_dc_luma_sse42
     30 *  - ihevcd_itrans_recon_dc_chroma_sse42
     31 *
     32 * @remarks
     33 *  None
     34 *
     35 *******************************************************************************
     36 */
     37 
#include <string.h>

#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevcd_function_selector.h"

#include <immintrin.h>
     45 
     46 
     47 void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
     48                                        WORD32 log2_trans_size, WORD16 i2_coeff_value)
     49 {
     50     __m128i m_temp_reg_0;
     51     __m128i m_temp_reg_1;
     52     __m128i m_temp_reg_2;
     53     __m128i m_temp_reg_3;
     54     __m128i m_temp_reg_4;
     55     __m128i m_temp_reg_5;
     56     __m128i m_temp_reg_6;
     57     __m128i m_temp_reg_7;
     58     __m128i m_temp_reg_8;
     59     __m128i m_temp_reg_9;
     60     __m128i m_temp_reg_10;
     61     __m128i m_temp_reg_11;
     62     __m128i m_temp_reg_12;
     63     __m128i m_temp_reg_13;
     64     __m128i m_temp_reg_14;
     65     __m128i m_temp_reg_15;
     66     __m128i m_temp_reg_20, zero_8x16b;
     67     __m128i *pi4_dst = (__m128i *)pu1_dst;
     68 
     69 
     70     //WORD32 row,col;
     71     WORD32 add, shift;
     72     WORD32 dc_value, quant_out;
     73     WORD32 trans_size;
     74 
     75 
     76 
     77 
     78     trans_size = (1 << log2_trans_size);
     79 
     80     quant_out = i2_coeff_value;
     81 
     82     shift = IT_SHIFT_STAGE_1;
     83     add = 1 << (shift - 1);
     84     dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
     85     shift = IT_SHIFT_STAGE_2;
     86     add = 1 << (shift - 1);
     87     dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
     88 
     89     /*Replicate the DC value within 16 bits in 128 bit register*/
     90     m_temp_reg_20 = _mm_set1_epi16(dc_value);
     91     zero_8x16b = _mm_setzero_si128();
     92 
     93     if(trans_size == 4)
     94     {
     95         WORD32 *pi4_dst = (WORD32 *)pu1_dst;
     96 
     97         m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
     98         m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
     99         m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
    100         m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
    101 
    102         m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
    103         m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
    104 
    105         m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
    106         m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
    107 
    108         m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
    109         m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
    110 
    111         m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
    112 
    113 
    114         *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
    115         m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
    116         m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
    117         m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
    118         pu1_dst += dst_strd;
    119         pi4_dst = (WORD32 *)(pu1_dst);
    120 
    121         *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
    122         pu1_dst += dst_strd;
    123         pi4_dst = (WORD32 *)(pu1_dst);
    124 
    125         *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
    126         pu1_dst += dst_strd;
    127         pi4_dst = (WORD32 *)(pu1_dst);
    128 
    129         *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
    130     }
    131     else
    132     {
    133         WORD32 i, j;
    134 
    135         for(i = 1; i <= trans_size; i += 4)
    136         {
    137             for(j = 1; j <= trans_size; j += 8)
    138             {
    139 
    140                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
    141                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
    142                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
    143                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
    144 
    145 
    146                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
    147                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
    148                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
    149                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
    150 
    151                 m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
    152                 m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
    153                 m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
    154                 m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
    155 
    156                 pi4_dst = (__m128i *)(pu1_dst);
    157 
    158                 m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
    159                 _mm_storel_epi64(pi4_dst, m_temp_reg_12);
    160 
    161                 pi4_dst = (__m128i *)(pu1_dst + dst_strd);
    162 
    163                 m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
    164                 _mm_storel_epi64(pi4_dst, m_temp_reg_13);
    165 
    166                 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
    167 
    168                 m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
    169                 _mm_storel_epi64(pi4_dst, m_temp_reg_14);
    170 
    171                 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
    172 
    173                 m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
    174                 _mm_storel_epi64(pi4_dst, m_temp_reg_15);
    175 
    176                 pu1_pred += 8;
    177                 pu1_dst += 8;
    178             }
    179             pu1_pred += 4 * pred_strd - trans_size;
    180             pu1_dst += 4 * dst_strd - trans_size;
    181         }
    182     }
    183 
    184 
    185 }
    186 
    187 void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
    188                                          WORD32 log2_trans_size, WORD16 i2_coeff_value)
    189 {
    190     __m128i m_temp_reg_0;
    191     __m128i m_temp_reg_1;
    192     __m128i m_temp_reg_2;
    193     __m128i m_temp_reg_3;
    194     __m128i m_temp_reg_4;
    195     __m128i m_temp_reg_5;
    196     __m128i m_temp_reg_6;
    197     __m128i m_temp_reg_7;
    198     __m128i m_temp_reg_8;
    199     __m128i m_temp_reg_9;
    200     __m128i m_temp_reg_10;
    201     __m128i m_temp_reg_11;
    202     __m128i m_temp_reg_12;
    203     __m128i m_temp_reg_13;
    204     __m128i m_temp_reg_14;
    205     __m128i m_temp_reg_15;
    206     __m128i m_temp_reg_20, zero_8x16b;
    207     __m128i *pi4_dst = (__m128i *)pu1_dst;
    208 
    209 
    210     //WORD32 row,col;
    211     WORD32 add, shift;
    212     WORD32 dc_value, quant_out;
    213     WORD32 trans_size;
    214 
    215 
    216     WORD32 shuffle_mask_4x4 = 0x06040200;
    217     WORD32 unchanged_mask_4x4 = 0x07050301;
    218     LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;
    219     LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;
    220 
    221     trans_size = (1 << log2_trans_size);
    222 
    223     quant_out = i2_coeff_value;
    224 
    225     shift = IT_SHIFT_STAGE_1;
    226     add = 1 << (shift - 1);
    227     dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
    228     shift = IT_SHIFT_STAGE_2;
    229     add = 1 << (shift - 1);
    230     dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
    231 
    232     /*Replicate the DC value within 16 bits in 128 bit register*/
    233     m_temp_reg_20 = _mm_set1_epi16(dc_value);
    234     zero_8x16b = _mm_setzero_si128();
    235 
    236     if(trans_size == 4)
    237     {
    238         __m128i chroma_shuffle_mask_16x8b;
    239         __m128i chroma_unchanged_mask_16x8b;
    240         chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
    241         chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
    242 
    243         /*Load the prediction data*/
    244         m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
    245         m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
    246         m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
    247         m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
    248 
    249         m_temp_reg_10  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
    250         m_temp_reg_11  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
    251         m_temp_reg_12  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
    252         m_temp_reg_13  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
    253 
    254         m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
    255         m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
    256 
    257         m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
    258         m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
    259 
    260         m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
    261         m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
    262 
    263         /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
    264         m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
    265         m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
    266         m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
    267         m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
    268 
    269         m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
    270         m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
    271         m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
    272         m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
    273 
    274 
    275         m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
    276         m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
    277         m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
    278         m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
    279         m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
    280         m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
    281         m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
    282         m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
    283 
    284         /*Store the result in the destination*/
    285         _mm_storel_epi64(pi4_dst, m_temp_reg_9);
    286         pu1_dst += dst_strd;
    287         pi4_dst = (__m128i *)(pu1_dst);
    288 
    289 
    290         _mm_storel_epi64(pi4_dst, m_temp_reg_10);
    291         pu1_dst += dst_strd;
    292         pi4_dst = (__m128i *)(pu1_dst);
    293 
    294         _mm_storel_epi64(pi4_dst, m_temp_reg_11);
    295         pu1_dst += dst_strd;
    296         pi4_dst = (__m128i *)(pu1_dst);
    297 
    298         _mm_storel_epi64(pi4_dst, m_temp_reg_12);
    299     }
    300     else
    301     {
    302         WORD32 i, j;
    303         __m128i chroma_shuffle_mask_16x8b;
    304         __m128i chroma_unchanged_mask_16x8b;
    305         chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
    306         chroma_unchanged_mask_16x8b =
    307                         _mm_loadl_epi64((__m128i *)(&unchanged_mask));
    308 
    309         for(i = 0; i < trans_size; i += 4)
    310         {
    311             for(j = 0; j < trans_size; j += 8)
    312             {
    313 
    314                 m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
    315                 m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
    316                 m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
    317                 m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
    318 
    319                 /*Retain only one chroma component*/
    320                 m_temp_reg_4  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
    321                 m_temp_reg_5  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
    322                 m_temp_reg_6  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
    323                 m_temp_reg_7  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
    324 
    325                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
    326                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
    327                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
    328                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
    329 
    330                 m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
    331                 m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
    332                 m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
    333                 m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
    334 
    335 
    336                 /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
    337                 m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
    338                 m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
    339                 m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
    340                 m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
    341 
    342                 m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
    343                 m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
    344                 m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
    345                 m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
    346 
    347                 m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
    348                 m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
    349 
    350                 m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
    351                 m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
    352                 m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
    353 
    354                 m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
    355                 m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
    356                 m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
    357 
    358                 /*Store the result in the destination*/
    359                 pi4_dst = (__m128i *)(pu1_dst);
    360 
    361                 _mm_storel_epi64(pi4_dst, m_temp_reg_12);
    362                 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
    363 
    364                 pi4_dst = (__m128i *)(pu1_dst + 8);
    365                 _mm_storel_epi64(pi4_dst, m_temp_reg_8);
    366 
    367                 pi4_dst = (__m128i *)(pu1_dst + dst_strd);
    368 
    369                 _mm_storel_epi64(pi4_dst, m_temp_reg_13);
    370                 m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
    371 
    372                 pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
    373                 _mm_storel_epi64(pi4_dst, m_temp_reg_9);
    374 
    375                 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
    376 
    377                 _mm_storel_epi64(pi4_dst, m_temp_reg_14);
    378                 m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
    379 
    380                 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
    381                 _mm_storel_epi64(pi4_dst, m_temp_reg_10);
    382 
    383                 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
    384 
    385                 _mm_storel_epi64(pi4_dst, m_temp_reg_15);
    386                 m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
    387 
    388                 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
    389                 _mm_storel_epi64(pi4_dst, m_temp_reg_11);
    390 
    391                 pu1_pred += 16;
    392                 pu1_dst += 16;
    393             }
    394 
    395             pu1_pred += 4 * pred_strd - 2 * trans_size;
    396             pu1_dst += 4 * dst_strd - 2 * trans_size;
    397         }
    398     }
    399 
    400 
    401 }
    402