/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ih264_chroma_intra_pred_filters_ssse3.c
*
* @brief
*  Contains function definitions for chroma intra prediction filters in x86
*  intrinsics
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ih264_intra_pred_chroma_8x8_mode_horz_ssse3
*  - ih264_intra_pred_chroma_8x8_mode_vert_ssse3
*  - ih264_intra_pred_chroma_8x8_mode_plane_ssse3
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* User include files */
#include "ih264_defs.h"
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_intra_pred_filters.h"

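/* Note on the neighbour layout, inferred from the pointer arithmetic in     */
/* the functions below (a working assumption, not taken from a header):      */
/* pu1_src is a flat array of interleaved UV neighbour samples, with the     */
/* left neighbours stored bottom-to-top in pu1_src[0 .. 2*BLK8x8SIZE - 1],   */
/* the top-left pair at pu1_src[2*BLK8x8SIZE], and the top neighbours        */
/* left-to-right from pu1_src[2*BLK8x8SIZE + 2] onwards.                     */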

/*****************************************************************************/
/* Chroma Intra prediction 8x8 filters                                       */
/*****************************************************************************/
/**
*******************************************************************************
*
* ih264_intra_pred_chroma_8x8_mode_horz_ssse3
*
* @brief
*  Perform Intra prediction for chroma_8x8 mode: Horizontal
*
* @par Description:
*  Perform Intra prediction for chroma_8x8 mode: Horizontal, described in
*  sec 8.3.4.2
*
* @param[in] pu1_src
*  UWORD8 pointer to the source containing alternate U and V samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination with alternate U and V samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] ngbr_avail
*  availability of neighbouring pixels (not used in this function)
*
* @returns
*  None
*
* @remarks
*  None
*
******************************************************************************
*/
void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_left; /* Pointer to start of left predictors */
    WORD32 dst_strd2;

    __m128i row1_16x8b, row2_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;

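    /* Each predicted row is the left-neighbour (U,V) pair of that row       */
    /* replicated eight times: _mm_set1_epi16 broadcasts the 16-bit UV pair  */
    /* across the register and a single 16-byte store fills the whole row.   */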
    dst_strd2 = dst_strd << 1;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
}
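
/* For reference, a minimal scalar equivalent of the horizontal mode (an
 * illustrative sketch, not part of the build; it assumes the neighbour
 * layout noted near the top of the file):
 *
 *     for(y = 0; y < 8; y++)
 *         for(x = 0; x < 8; x++)
 *         {
 *             pu1_dst[y * dst_strd + 2 * x]     = pu1_left[-2 * y];     // U
 *             pu1_dst[y * dst_strd + 2 * x + 1] = pu1_left[-2 * y + 1]; // V
 *         }
 */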

/**
*******************************************************************************
*
* ih264_intra_pred_chroma_8x8_mode_vert_ssse3
*
* @brief
*  Perform Intra prediction for chroma_8x8 mode: Vertical
*
* @par Description:
*  Perform Intra prediction for chroma_8x8 mode: Vertical, described in
*  sec 8.3.4.3
*
* @param[in] pu1_src
*  UWORD8 pointer to the source containing alternate U and V samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination with alternate U and V samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] ngbr_avail
*  availability of neighbouring pixels (not used in this function)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_top; /* Pointer to start of top predictors */
    WORD32 dst_strd2;

    __m128i top_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;

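    /* The top neighbour row (eight interleaved UV pairs, 16 bytes) is       */
    /* loaded once and stored unchanged to all eight destination rows.       */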
    top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

    dst_strd2 = dst_strd << 1;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
}
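
/* Scalar equivalent (an illustrative sketch, not part of the build):
 *
 *     for(y = 0; y < 8; y++)
 *         memcpy(pu1_dst + y * dst_strd, pu1_top, 16);
 */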

/**
*******************************************************************************
*
* ih264_intra_pred_chroma_8x8_mode_plane_ssse3
*
* @brief
*  Perform Intra prediction for chroma_8x8 mode: Plane
*
* @par Description:
*  Perform Intra prediction for chroma_8x8 mode: Plane, described in
*  sec 8.3.4.4
*
* @param[in] pu1_src
*  UWORD8 pointer to the source containing alternate U and V samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination with alternate U and V samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] ngbr_avail
*  availability of neighbouring pixels (not used in this function)
*
* @returns
*  None
*
* @remarks
*  None
*
******************************************************************************
*/
void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
{
    UWORD8 *pu1_left, *pu1_top;
    WORD32 a_u, a_v, b_u, b_v, c_u, c_v;

    __m128i mul_8x16b, shuffle_8x16b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + MB_SIZE + 2;
    pu1_left = pu1_src + MB_SIZE - 2;

    mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
    shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
                                   0xff01, 0xff03, 0xff05, 0xff07);
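
    /* mul_8x16b holds the plane-mode weights {1, 2, 3, 4} for U and for V.  */
    /* shuffle_8x16b, used with _mm_shuffle_epi8, de-interleaves UV pairs:   */
    /* each 0xffXX lane copies source byte XX into its low byte and zeroes   */
    /* the high byte (index MSB set), so lanes 0-3 receive the U samples     */
    /* and lanes 4-7 the V samples, zero-extended to 16 bits.                */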

    // calculate a, b and c
    {
        WORD32 h_u, h_v, v_u, v_v;

        __m128i h_val1_16x8b, h_val2_16x8b;
        __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
        __m128i v_val1_16x8b, v_val2_16x8b;
        __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
        __m128i hv_val_4x32b;

        h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));   // top pairs 4..7
        h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));   // top-left, top pairs 0..2
        v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14)); // left pairs 7..4
        v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));  // left pairs 2..0, top-left

        // reverse the order of the four 16-bit UV pairs (0x1b selects 0,1,2,3 -> 3,2,1,0)
        h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
        v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);

        // separating u and v and 8-bit to 16-bit conversion
        h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
        h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
        v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
        v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);

        h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
        v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);

        h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
        v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);

        hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);

        a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
        a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;

        h_u = _mm_extract_epi16(hv_val_4x32b, 0);
        h_v = _mm_extract_epi16(hv_val_4x32b, 2);
        v_u = _mm_extract_epi16(hv_val_4x32b, 4);
        v_v = _mm_extract_epi16(hv_val_4x32b, 6);

        h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
        h_v = (h_v << 16) >> 15;
        v_u = (v_u << 16) >> 15;
        v_v = (v_v << 16) >> 15;

        b_u = ((h_u << 4) + h_u + 32) >> 6;
        b_v = ((h_v << 4) + h_v + 32) >> 6;
        c_u = ((v_u << 4) + v_u + 32) >> 6;
        c_v = ((v_v << 4) + v_v + 32) >> 6;
    }
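
    /* Mapping to sec 8.3.4.4 (a sketch of the arithmetic above): with
     *     H = sum_{x = 0..3} (x + 1) * (top[4 + x] - top[2 - x])
     * and the analogous V over the left neighbours, the spec computes
     *     a = 16 * (left[7] + top[7])
     *     b = (34 * H + 32) >> 6
     *     c = (34 * V + 32) >> 6
     * madd/hadd leave H and V in 32-bit lanes; each extract takes the low
     * 16 bits and (h << 16) >> 15 restores the sign while doubling, so
     * ((h << 4) + h + 32) >> 6 == (17 * 2H + 32) >> 6 == (34 * H + 32) >> 6.
     */
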
    // use a, b and c to compute the fitted plane values
    {
        __m128i const_8x16b, c2_8x16b;
        __m128i res1_l_8x16b, res1_h_8x16b;
        __m128i res2_l_8x16b, res2_h_8x16b;
        __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
        __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;

        WORD32 b_u2, b_v2, b_u3, b_v3;
        WORD32 const_u, const_v;
        WORD32 dst_strd2;

        const_u = a_u - (c_u << 1) - c_u + 16; // a_u - 3 * c_u + 16
        const_v = a_v - (c_v << 1) - c_v + 16; // a_v - 3 * c_v + 16

        b_u2 = b_u << 1;
        b_v2 = b_v << 1;
        b_u3 = b_u + b_u2;
        b_v3 = b_v + b_v2;

        const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v,
                                     const_u, const_v, const_u, const_v);
        // {-3*b, -2*b, -1*b, 0*b} for U and V interleaved
        res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
        // {1*b, 2*b, 3*b, 4*b} for U and V interleaved
        res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
        c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);
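
        /* After adding const, res1_l/res1_h hold row y = 0 of
         *     a + b * (x - 3) + c * (y - 3) + 16   for x = 0..7 (U, V interleaved)
         * and res2 = res1 + c gives row y = 1. Once c2 is doubled to 2 * c,
         * each later row pair advances both accumulators by 2 * c. The
         * arithmetic shift by 5 plus _mm_packus_epi16 (unsigned saturation)
         * implements the spec's (...) >> 5 followed by the clip to [0, 255].
         */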

        // rows 1, 2
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        dst_strd2 = dst_strd << 1;
        c2_8x16b = _mm_slli_epi16(c2_8x16b, 1); // c2 now holds 2 * c for the remaining row pairs

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 3, 4
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 5, 6
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 7, 8
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
    }
}
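
/* Scalar equivalent of the plane prediction (an illustrative sketch, not
 * part of the build; CLIP_U8 here stands for a clamp to [0, 255], the exact
 * macro name in this codebase may differ):
 *
 *     for(y = 0; y < 8; y++)
 *         for(x = 0; x < 8; x++)
 *         {
 *             WORD32 u = (a_u + b_u * (x - 3) + c_u * (y - 3) + 16) >> 5;
 *             WORD32 v = (a_v + b_v * (x - 3) + c_v * (y - 3) + 16) >> 5;
 *             pu1_dst[y * dst_strd + 2 * x]     = CLIP_U8(u);
 *             pu1_dst[y * dst_strd + 2 * x + 1] = CLIP_U8(v);
 *         }
 */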