/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_chroma_intra_pred_filters_x86_intr.c
*
* @brief
*  Contains function definitions for chroma intra prediction interpolation
*  filters
*
* @author
*  Ittiam
*
* @par List of Functions:
*  ihevc_intra_pred_chroma_planar_sse42()
*
*  ihevc_intra_pred_chroma_dc_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_platform_macros.h"
#include "ihevc_intra_pred.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_common_tables.h"
#include "ihevc_tables_x86_intr.h"

#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>


/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64
#define T16C_4NT 64
#define T8C_4NT 32
/****************************************************************************/
/* Function Macros                                                          */
/****************************************************************************/

#define GET_BIT(y,x) ((y) & (1 << x)) && (1 << x)

/* tables to shuffle 8-bit values */

/*****************************************************************************/
/* Function Definition                                                       */
/*****************************************************************************/


/**
*******************************************************************************
*
* @brief
*  Planar intra prediction using the neighbouring reference samples pointed to
*  by 'pu1_ref' to predict the TU block pointed to by 'pu1_dst'. Refer to
*  section 8.4.4.2.4 of the standard.
*
* @par Description:
*
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{

    WORD32 row, col;
    WORD32 log2nt = 5;
    WORD32 two_nt, three_nt;

    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
    UNUSED(src_strd);
    UNUSED(mode);

    switch(nt)
    {
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }
    two_nt = 2 * nt;
    three_nt = 3 * nt;

    /* Planar filtering */

/* Set up the reference values in registers */

//  pu1_ref[2*(two_nt - 1 - row)]
//  pu1_ref[2 * (three_nt + 1)]
//  pu1_ref[2 * (two_nt + 1) + col]
//  pu1_ref[2 * (nt - 1)]
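//
//  Per chroma sample (U and V interleaved), the prediction computed below is:
//  ((nt - 1 - col) * pu1_ref[2 * (two_nt - 1 - row)]      (left)
//   + (col + 1) * pu1_ref[2 * (three_nt + 1)]             (top-right)
//   + (nt - 1 - row) * pu1_ref[2 * (two_nt + 1 + col)]    (top)
//   + (row + 1) * pu1_ref[2 * (nt - 1)]                   (bottom-left)
//   + nt) >> (log2nt + 1)
//  where col is the chroma column index.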

    const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
                                      pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
                                      pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);

    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);

    const_temp4_4x32b = _mm_set1_epi16(nt - 1);
    const_temp6_4x32b = _mm_set1_epi16(nt);
    const_temp7_4x32b = _mm_set1_epi16(4);

    zero_8x16b = _mm_set1_epi32(0);

    if(nt % 4 == 0)
    {
        const_temp7_4x32b = _mm_set1_epi16(4);

        for(row = 0; row < nt; row++)
        {
            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
            __m128i res_temp3_8x16b;

            const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
                                               pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
                                               pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);

            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
            row_8x16b = _mm_set1_epi16((nt - 1 - row));

            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
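            /* Each 16-bit lane holds one chroma component and lanes are paired as
               (U, V), so col_8x16b carries (col + 1) and, after the subtraction
               below, const_temp5_4x32b carries (nt - 1 - col) for the four chroma
               columns processed per inner-loop iteration. */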

            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);

            /* (row + 1) * pu1_ref[2 * (nt - 1)] */
            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);

            /* (row + 1) * pu1_ref[2 * (nt - 1)] + nt */
            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);

            for(col = 0; col < 2 * nt; col += 8)
            {
                __m128i src_temp_8x16b;

                /* load 16 8-bit reference samples (interleaved UV) */
                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));

                src_temp_8x16b =  _mm_cvtepu8_epi16(src_temp_8x16b); /* widen the low 8 bytes to 16 bits */

                /* (nt - 1 - row) * pu1_ref[2 * (two_nt + 1) + col] */
                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);

                /* (col + 1) * pu1_ref[2 * (three_nt + 1)] */
                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);

                /* (nt - 1 - col) * pu1_ref[2 * (two_nt - 1 - row)] */
                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);

                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);

                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);

                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
            } /* inner loop ends here */
        }
    }
}
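
/*
 * Illustrative scalar sketch of the planar equation (section 8.4.4.2.4) as it
 * applies to interleaved 4:2:0 chroma. This is not part of the build (#if 0);
 * it only restates what the SSE4.2 kernel above computes, and the helper name
 * and the explicit log2nt argument are local to this sketch.
 */
#if 0
static void chroma_planar_scalar_sketch(UWORD8 *pu1_ref,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 log2nt)
{
    WORD32 row, col, c;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;

    for(row = 0; row < nt; row++)
    {
        for(col = 0; col < nt; col++)
        {
            for(c = 0; c < 2; c++) /* c = 0 -> U, c = 1 -> V (interleaved) */
            {
                WORD32 pred = (nt - 1 - col) * pu1_ref[2 * (two_nt - 1 - row) + c]  /* left        */
                            + (col + 1) * pu1_ref[2 * (three_nt + 1) + c]           /* top-right   */
                            + (nt - 1 - row) * pu1_ref[2 * (two_nt + 1 + col) + c]  /* top         */
                            + (row + 1) * pu1_ref[2 * (nt - 1) + c]                 /* bottom-left */
                            + nt;

                pu1_dst[(row * dst_strd) + 2 * col + c] = (UWORD8)(pred >> (log2nt + 1));
            }
        }
    }
}
#endif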

/**
*******************************************************************************
*
* @brief
*  Intra prediction for DC mode using the neighbouring reference samples
*  pointed to by 'pu1_ref' to predict the TU block pointed to by 'pu1_dst'.
*  Refer to section 8.4.4.2.5 of the standard.
*
* @par Description:
*
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size (Chroma)
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{

    WORD32 acc_dc_u, acc_dc_v;
    WORD32 dc_val_u, dc_val_v;
    WORD32 row;
    WORD32 log2nt = 5;
    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
    __m128i m_zero = _mm_set1_epi32(0);
    UNUSED(src_strd);
    UNUSED(mode);

    switch(nt)
    {
        case 32:
            log2nt = 5;
            break;
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }

    acc_dc_u = 0;
    acc_dc_v = 0;

    /* Calculate DC value for the transform block */

    m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
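
    /* IHEVCE_SHUFFLEMASKY9 regroups the interleaved U/V 16-bit partial sums so
       that the two horizontal adds below leave the U accumulation in the low
       32 bits and the V accumulation in the next 32 bits, which is what the
       acc_dc_u / acc_dc_v extractions at the end of each branch rely on. */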

    if(nt == 16)
    {
        __m128i temp_sad;

        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
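
        /* The 64 bytes loaded above are 32 interleaved UV pairs starting at pair
           index nt: the nt left neighbours, the top-left pair and nt - 1 of the
           top neighbours. The missing last top pair is added back and the
           top-left pair subtracted out after the branches. */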

        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
        src_temp9 =  _mm_cvtepu8_epi16(src_temp7);
        src_temp10 =  _mm_cvtepu8_epi16(src_temp8);

        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);
        src_temp7 = _mm_srli_si128(src_temp7, 8);
        src_temp8 = _mm_srli_si128(src_temp8, 8);

        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
        src_temp7 =  _mm_cvtepu8_epi16(src_temp7);
        src_temp8 =  _mm_cvtepu8_epi16(src_temp8);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 8)
    {
        __m128i temp_sad;
        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
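
        /* 32 bytes = 16 interleaved UV pairs starting at pair index nt: the left
           column, the top-left pair and all but the last top neighbour; the two
           corrections after the branches fix this up. */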

        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);

        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);

        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 4)
    {
        __m128i temp_sad;
        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
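
        /* 16 bytes = 8 interleaved UV pairs starting at pair index nt: the left
           column, the top-left pair and all but the last top neighbour. */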

        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp4 = _mm_srli_si128(src_temp3, 8);
        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);

        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    }


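    /* Complete the accumulation: add the last top neighbour (pair index 3 * nt,
       i.e. bytes 6 * nt and 6 * nt + 1), which the vector loads above stop short
       of, and subtract the top-left pair (pair index 2 * nt, bytes 4 * nt and
       4 * nt + 1), which they include. */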
    acc_dc_u += pu1_ref[6 * nt];
    acc_dc_v += pu1_ref[6 * nt + 1];

    acc_dc_u -= pu1_ref[4 * nt];
    acc_dc_v -= pu1_ref[4 * nt + 1];

    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);

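    /* Pack U into the low byte and V into the high byte so that _mm_set1_epi16
       below replicates the interleaved (U, V) pair across a whole register. */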
    dc_val_u = dc_val_u | (dc_val_v << 8);

    /* Fill the block with the DC value */

    if(nt == 4)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

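        /* A 4x4 chroma block is 4 UV pairs = 8 bytes per row: one 64-bit store each. */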
        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

    }
    else if(nt == 8)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

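        /* An 8x8 chroma block is 8 UV pairs = 16 bytes per row: one full-register store each. */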
        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);

    }

    else /* nt == 16 */
    {

        src_temp1 = _mm_set1_epi16(dc_val_u);

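        /* Each iteration fills 8 rows; a 16x16 chroma row is 16 UV pairs = 32 bytes,
           i.e. two 16-byte stores per row. */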
        for(row = 0; row < nt; row += 8)
        {
            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);

            pu1_dst += 8 * dst_strd;
        }

    }

}
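
/*
 * Illustrative scalar sketch of chroma DC prediction (section 8.4.4.2.5) for
 * interleaved 4:2:0 chroma. Not part of the build (#if 0); it restates what
 * the SSE4.2 kernel above computes, and the helper name and explicit log2nt
 * argument are local to this sketch.
 */
#if 0
static void chroma_dc_scalar_sketch(UWORD8 *pu1_ref,
                                    UWORD8 *pu1_dst,
                                    WORD32 dst_strd,
                                    WORD32 nt,
                                    WORD32 log2nt)
{
    WORD32 acc_u = 0, acc_v = 0;
    WORD32 row, col, i;
    WORD32 two_nt = 2 * nt;
    UWORD8 dc_u, dc_v;

    /* Sum the nt left neighbours (pair indices nt .. 2 * nt - 1) and the nt top
       neighbours (pair indices 2 * nt + 1 .. 3 * nt), skipping the top-left pair. */
    for(i = nt; i < two_nt; i++)
    {
        acc_u += pu1_ref[2 * i];
        acc_v += pu1_ref[2 * i + 1];
    }
    for(i = two_nt + 1; i <= two_nt + nt; i++)
    {
        acc_u += pu1_ref[2 * i];
        acc_v += pu1_ref[2 * i + 1];
    }

    dc_u = (UWORD8)((acc_u + nt) >> (log2nt + 1));
    dc_v = (UWORD8)((acc_v + nt) >> (log2nt + 1));

    /* Chroma DC prediction has no boundary filtering: every sample pair gets (dc_u, dc_v). */
    for(row = 0; row < nt; row++)
    {
        for(col = 0; col < nt; col++)
        {
            pu1_dst[(row * dst_strd) + 2 * col]     = dc_u;
            pu1_dst[(row * dst_strd) + 2 * col + 1] = dc_v;
        }
    }
}
#endif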