Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22  *******************************************************************************
     23  * @file
     24  *  impeg2_inter_pred_sse42_intr.c
     25  *
     26  * @brief
     27  *  Contains Motion compensation function definitions for MPEG2 decoder
     28  *
     29  * @author
     30  *  Mohit [100664]
 *
 * @par List of Functions:
 * - impeg2_copy_mb_sse42()
     33  * - impeg2_interpolate_sse42()
     34  * - impeg2_mc_halfx_halfy_8x8_sse42()
     35  * - impeg2_mc_halfx_fully_8x8_sse42()
     36  * - impeg2_mc_fullx_halfy_8x8_sse42()
     37  * - impeg2_mc_fullx_fully_8x8_sse42()
     38  *
     39  * @remarks
     40  *  None
     41  *
     42  *******************************************************************************
     43  */
     44 #include <stdio.h>
     45 #include <string.h>
     46 #include "iv_datatypedef.h"
     47 #include "impeg2_macros.h"
     48 #include "impeg2_defs.h"
     49 #include "impeg2_inter_pred.h"
     50 
     51 #include <immintrin.h>
     52 #include <emmintrin.h>
     53 #include <smmintrin.h>
     54 #include <tmmintrin.h>
     55 
     56 /*******************************************************************************
     57 *  Function Name   : impeg2_copy_mb
     58 *
     59 *  Description     : copies 3 components to the frame from mc_buf
     60 *
     61 *  Arguments       :
     62 *  src_buf         : Source Buffer
     63 *  dst_buf         : Destination Buffer
     64 *  src_wd          : Source Width
     65 *  dst_wd          : destination Width
     66 *
     67 *  Values Returned : None
     68 *******************************************************************************/
     69 void impeg2_copy_mb_sse42(yuv_buf_t *src_buf,
     70                     yuv_buf_t *dst_buf,
     71                     UWORD32 src_wd,
     72                     UWORD32 dst_wd)
     73 {
     74     UWORD8 *src;
     75     UWORD8 *dst;
     76     __m128i src_r0, src_r1, src_r2, src_r3;
     77 
     78     /*******************************************************/
     79     /* copy Y                                              */
     80     /*******************************************************/
     81     src = src_buf->pu1_y;
     82     dst = dst_buf->pu1_y;
     83     // Row 0-3
     84     src_r0 = _mm_loadu_si128((__m128i *) (src));
     85     src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
     86     src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
     87     src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
     88 
     89     _mm_storeu_si128((__m128i *) dst, src_r0);
     90     _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
     91     _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
     92     _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
     93 
     94     // Row 4-7
     95     src += 4 * src_wd;
     96     dst += 4 * dst_wd;
     97     src_r0 = _mm_loadu_si128((__m128i *) (src));
     98     src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
     99     src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
    100     src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
    101 
    102     _mm_storeu_si128((__m128i *) dst, src_r0);
    103     _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
    104     _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
    105     _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
    106 
    107     // Row 8-11
    108     src += 4 * src_wd;
    109     dst += 4 * dst_wd;
    110     src_r0 = _mm_loadu_si128((__m128i *) (src));
    111     src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
    112     src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
    113     src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
    114 
    115     _mm_storeu_si128((__m128i *) dst, src_r0);
    116     _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
    117     _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
    118     _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
    119 
    120     // Row 12-15
    121     src += 4 * src_wd;
    122     dst += 4 * dst_wd;
    123     src_r0 = _mm_loadu_si128((__m128i *) (src));
    124     src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
    125     src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
    126     src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
    127 
    128     _mm_storeu_si128((__m128i *) dst, src_r0);
    129     _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
    130     _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
    131     _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
    132 
    133     src_wd >>= 1;
    134     dst_wd >>= 1;
    135 
    136     /*******************************************************/
    137     /* copy U                                              */
    138     /*******************************************************/
    139     src = src_buf->pu1_u;
    140     dst = dst_buf->pu1_u;
    141 
    142     // Row 0-3
    143     src_r0 =  _mm_loadl_epi64((__m128i *)src);
    144     src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
    145     src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
    146     src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
    147 
    148     _mm_storel_epi64((__m128i *)dst, src_r0);
    149     _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
    150     _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
    151     _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
    152 
    153     // Row 4-7
    154     src += 4 * src_wd;
    155     dst += 4 * dst_wd;
    156 
    157     src_r0 =  _mm_loadl_epi64((__m128i *)src);
    158     src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
    159     src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
    160     src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
    161 
    162     _mm_storel_epi64((__m128i *)dst, src_r0);
    163     _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
    164     _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
    165     _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
    166 
    167     /*******************************************************/
    168     /* copy V                                              */
    169     /*******************************************************/
    170     src = src_buf->pu1_v;
    171     dst = dst_buf->pu1_v;
    172     // Row 0-3
    173     src_r0 =  _mm_loadl_epi64((__m128i *)src);
    174     src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
    175     src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
    176     src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
    177 
    178     _mm_storel_epi64((__m128i *)dst, src_r0);
    179     _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
    180     _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
    181     _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
    182 
    183     // Row 4-7
    184     src += 4 * src_wd;
    185     dst += 4 * dst_wd;
    186 
    187     src_r0 =  _mm_loadl_epi64((__m128i *)src);
    188     src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
    189     src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
    190     src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
    191 
    192     _mm_storel_epi64((__m128i *)dst, src_r0);
    193     _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
    194     _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
    195     _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
    196 }
    197 
    198 /*****************************************************************************/
    199 /*                                                                           */
    200 /*  Function Name : impeg2_interpolate                                       */
    201 /*                                                                           */
    202 /*  Description   : averages the contents of buf_src1 and buf_src2 and stores*/
    203 /*                  result in buf_dst                                        */
    204 /*                                                                           */
    205 /*  Inputs        : buf_src1 -  First Source                                 */
    206 /*                  buf_src2 -  Second Source                                */
    207 /*                                                                           */
    208 /*  Globals       : None                                                     */
    209 /*                                                                           */
    210 /*  Processing    : Avg the values from two sources and store the result in  */
    211 /*                  destination buffer                                       */
    212 /*                                                                           */
    213 /*  Outputs       : buf_dst  -  Avg of contents of buf_src1 and buf_src2     */
    214 /*                                                                           */
    215 /*  Returns       : None                                                     */
    216 /*                                                                           */
    217 /*  Issues        : Assumes that all 3 buffers are of same size              */
    218 /*                                                                           */
    219 /*****************************************************************************/
    220 void impeg2_interpolate_sse42(yuv_buf_t *buf_src1,
    221                         yuv_buf_t *buf_src2,
    222                         yuv_buf_t *buf_dst,
    223                         UWORD32 stride)
    224 {
    225     UWORD8 *src1, *src2;
    226     UWORD8 *dst;
    227     __m128i src1_r0, src1_r1, src1_r2, src1_r3;
    228     __m128i src2_r0, src2_r1, src2_r2, src2_r3;
    229 
    230     /*******************************************************/
    231     /* interpolate Y                                       */
    232     /*******************************************************/
    233     src1 = buf_src1->pu1_y;
    234     src2 = buf_src2->pu1_y;
    235     dst  = buf_dst->pu1_y;
    236     // Row 0-3
    237     src1_r0 = _mm_loadu_si128((__m128i *) (src1));
    238     src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
    239     src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
    240     src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
    241 
    242     src2_r0 = _mm_loadu_si128((__m128i *) (src2));
    243     src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
    244     src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
    245     src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
    246 
    247     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    248     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    249     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    250     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    251 
    252     _mm_storeu_si128((__m128i *) dst, src1_r0);
    253     _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
    254     _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
    255     _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
    256 
    257     // Row 4-7
    258     src1 += 4 * 16;
    259     src2 += 4 * 16;
    260     dst += 4 * stride;
    261     src1_r0 = _mm_loadu_si128((__m128i *) (src1));
    262     src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
    263     src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
    264     src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
    265 
    266     src2_r0 = _mm_loadu_si128((__m128i *) (src2));
    267     src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
    268     src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
    269     src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
    270 
    271     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    272     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    273     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    274     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    275 
    276     _mm_storeu_si128((__m128i *) dst, src1_r0);
    277     _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
    278     _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
    279     _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
    280 
    281     // Row 8-11
    282     src1 += 4 * 16;
    283     src2 += 4 * 16;
    284     dst += 4 * stride;
    285     src1_r0 = _mm_loadu_si128((__m128i *) (src1));
    286     src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
    287     src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
    288     src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
    289 
    290     src2_r0 = _mm_loadu_si128((__m128i *) (src2));
    291     src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
    292     src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
    293     src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
    294 
    295     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    296     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    297     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    298     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    299 
    300     _mm_storeu_si128((__m128i *) dst, src1_r0);
    301     _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
    302     _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
    303     _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
    304 
    305     // Row 12-15
    306     src1 += 4 * 16;
    307     src2 += 4 * 16;
    308     dst += 4 * stride;
    309     src1_r0 = _mm_loadu_si128((__m128i *) (src1));
    310     src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
    311     src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
    312     src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
    313 
    314     src2_r0 = _mm_loadu_si128((__m128i *) (src2));
    315     src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
    316     src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
    317     src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
    318 
    319     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    320     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    321     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    322     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    323 
    324     _mm_storeu_si128((__m128i *) dst, src1_r0);
    325     _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
    326     _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
    327     _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
    328 
    329     stride >>= 1;
    330 
    331     /*******************************************************/
    332     /* interpolate U                                       */
    333     /*******************************************************/
    334     src1 = buf_src1->pu1_u;
    335     src2 = buf_src2->pu1_u;
    336     dst  = buf_dst->pu1_u;
    337     // Row 0-3
    338     src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
    339     src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
    340     src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
    341     src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
    342 
    343     src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
    344     src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
    345     src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
    346     src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
    347 
    348     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    349     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    350     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    351     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    352 
    353     _mm_storel_epi64((__m128i *) dst, src1_r0);
    354     _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
    355     _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
    356     _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
    357 
    358     // Row 4-7
    359     src1 += 4 * 8;
    360     src2 += 4 * 8;
    361     dst += 4 * stride;
    362 
    363     src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
    364     src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
    365     src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
    366     src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
    367 
    368     src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
    369     src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
    370     src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
    371     src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
    372 
    373     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    374     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    375     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    376     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    377 
    378     _mm_storel_epi64((__m128i *) dst, src1_r0);
    379     _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
    380     _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
    381     _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
    382 
    383     /*******************************************************/
    384     /* interpolate V                                       */
    385     /*******************************************************/
    386     src1 = buf_src1->pu1_v;
    387     src2 = buf_src2->pu1_v;
    388     dst  = buf_dst->pu1_v;
    389 
    390     // Row 0-3
    391     src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
    392     src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
    393     src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
    394     src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
    395 
    396     src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
    397     src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
    398     src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
    399     src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
    400 
    401     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    402     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    403     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    404     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    405 
    406     _mm_storel_epi64((__m128i *) dst, src1_r0);
    407     _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
    408     _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
    409     _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
    410 
    411     // Row 4-7
    412     src1 += 4 * 8;
    413     src2 += 4 * 8;
    414     dst += 4 * stride;
    415 
    416     src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
    417     src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
    418     src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
    419     src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
    420 
    421     src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
    422     src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
    423     src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
    424     src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
    425 
    426     src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
    427     src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
    428     src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
    429     src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
    430 
    431     _mm_storel_epi64((__m128i *) dst, src1_r0);
    432     _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
    433     _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
    434     _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
    435 }
    436 
    437 /*****************************************************************************/
    438 /*                                                                           */
    439 /*  Function Name : impeg2_mc_halfx_halfy_8x8_sse42()                                 */
    440 /*                                                                           */
    441 /*  Description   : Gets the buffer from (0.5,0.5) to (8.5,8.5)              */
    442 /*                  and the above block of size 8 x 8 will be placed as a    */
    443 /*                  block from the current position of out_buf               */
    444 /*                                                                           */
    445 /*  Inputs        : ref - Reference frame from which the block will be       */
    446 /*                        block will be extracted.                           */
    447 /*                  ref_wid - WIdth of reference frame                       */
    448 /*                  out_wid - WIdth of the output frame                      */
    449 /*                  blk_width  - width of the block                          */
    450 /*                  blk_width  - height of the block                         */
    451 /*                                                                           */
    452 /*  Globals       : None                                                     */
    453 /*                                                                           */
    454 /*  Processing    : Point to the (0,0),(1,0),(0,1),(1,1) position in         */
    455 /*                  the ref frame.Interpolate these four values to get the   */
    456 /*                  value at(0.5,0.5).Repeat this to get an 8 x 8 block      */
    457 /*                  using 9 x 9 block from reference frame                   */
    458 /*                                                                           */
    459 /*  Outputs       : out -  Output containing the extracted block             */
    460 /*                                                                           */
    461 /*  Returns       : None                                                     */
    462 /*                                                                           */
    463 /*  Issues        : None                                                     */
    464 /*                                                                           */
    465 /*****************************************************************************/
    466 void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out,
    467                             UWORD8 *ref,
    468                             UWORD32 ref_wid,
    469                             UWORD32 out_wid)
    470 {
    471     UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3;
    472     /* P0-P3 are the pixels in the reference frame and Q is the value being */
    473     /* estimated                                                            */
    474     /*
    475        P0 P1
    476          Q
    477        P2 P3
    478     */
    479     __m128i src_r0, src_r0_1, src_r1, src_r1_1;
    480     __m128i tmp0, tmp1;
    481     __m128i value_2 = _mm_set1_epi16(2);
    482 
    483     ref_p0 = ref;
    484     ref_p1 = ref + 1;
    485     ref_p2 = ref + ref_wid;
    486     ref_p3 = ref + ref_wid + 1;
    487 
    488     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 0
    489     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
    490     src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 1
    491     src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    492 
    493     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    494     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    495     src_r1 =  _mm_cvtepu8_epi16(src_r1);
    496     src_r1_1 =  _mm_cvtepu8_epi16(src_r1_1);
    497 
    498     tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 0 horizontal interpolation
    499     tmp1 = _mm_add_epi16(src_r1, src_r1_1);             //Row 1 horizontal interpolation
    500     tmp0 = _mm_add_epi16(tmp0, tmp1);                   //Row 0 vertical interpolation
    501     tmp0 = _mm_add_epi16(tmp0, value_2);
    502     tmp0 =  _mm_srli_epi16(tmp0, 2);
    503     tmp0 = _mm_packus_epi16(tmp0, value_2);
    504 
    505     _mm_storel_epi64((__m128i *)out, tmp0);
    506 
    507     //Row 1
    508     ref_p2 += ref_wid;
    509     ref_p3 += ref_wid;
    510     out += out_wid;
    511 
    512     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 2
    513     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    514 
    515     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    516     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    517 
    518     tmp0 = _mm_add_epi16(src_r0, src_r0_1);         //Row 2 horizontal interpolation
    519     tmp1 = _mm_add_epi16(tmp0, tmp1);               //Row 1 vertical interpolation
    520     tmp1 = _mm_add_epi16(tmp1, value_2);
    521     tmp1 =  _mm_srli_epi16(tmp1, 2);
    522     tmp1 = _mm_packus_epi16(tmp1, value_2);
    523 
    524     _mm_storel_epi64((__m128i *)out, tmp1);
    525 
    526     //Row 2
    527     ref_p2 += ref_wid;
    528     ref_p3 += ref_wid;
    529     out += out_wid;
    530 
    531     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 3
    532     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    533 
    534     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    535     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    536 
    537     tmp1 = _mm_add_epi16(src_r0, src_r0_1);         //Row 3 horizontal interpolation
    538 
    539     tmp0 = _mm_add_epi16(tmp0, tmp1);               //Row 2 vertical interpolation
    540     tmp0 = _mm_add_epi16(tmp0, value_2);
    541     tmp0 =  _mm_srli_epi16(tmp0, 2);
    542     tmp0 = _mm_packus_epi16(tmp0, value_2);
    543 
    544     _mm_storel_epi64((__m128i *)out, tmp0);
    545 
    546     //Row 3
    547     ref_p2 += ref_wid;
    548     ref_p3 += ref_wid;
    549     out += out_wid;
    550 
    551     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 4
    552     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    553 
    554     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    555     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    556 
    557     tmp0 = _mm_add_epi16(src_r0, src_r0_1);         //Row 4 horizontal interpolation
    558 
    559     tmp1 = _mm_add_epi16(tmp0, tmp1);               //Row 3 vertical interpolation
    560     tmp1 = _mm_add_epi16(tmp1, value_2);
    561     tmp1 =  _mm_srli_epi16(tmp1, 2);
    562     tmp1 = _mm_packus_epi16(tmp1, value_2);
    563 
    564     _mm_storel_epi64((__m128i *)out, tmp1);
    565 
    566     //Row 4
    567     ref_p2 += ref_wid;
    568     ref_p3 += ref_wid;
    569     out += out_wid;
    570 
    571     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 5
    572     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    573 
    574     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    575     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    576 
    577     tmp1 = _mm_add_epi16(src_r0, src_r0_1);     //Row 5 horizontal interpolation
    578 
    579     tmp0 = _mm_add_epi16(tmp0, tmp1);           //Row 4 vertical interpolation
    580     tmp0 = _mm_add_epi16(tmp0, value_2);
    581     tmp0 =  _mm_srli_epi16(tmp0, 2);
    582     tmp0 = _mm_packus_epi16(tmp0, value_2);
    583 
    584     _mm_storel_epi64((__m128i *)out, tmp0);
    585 
    586     //Row 5
    587     ref_p2 += ref_wid;
    588     ref_p3 += ref_wid;
    589     out += out_wid;
    590 
    591     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 6
    592     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    593 
    594     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    595     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    596 
    597     tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 6 horizontal interpolation
    598 
    599     tmp1 = _mm_add_epi16(tmp0, tmp1);                   //Row 5 vertical interpolation
    600     tmp1 = _mm_add_epi16(tmp1, value_2);
    601     tmp1 =  _mm_srli_epi16(tmp1, 2);
    602     tmp1 = _mm_packus_epi16(tmp1, value_2);
    603 
    604     _mm_storel_epi64((__m128i *)out, tmp1);
    605 
    606     //Row 6
    607     ref_p2 += ref_wid;
    608     ref_p3 += ref_wid;
    609     out += out_wid;
    610 
    611     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 7
    612     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    613 
    614     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    615     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    616 
    617     tmp1 = _mm_add_epi16(src_r0, src_r0_1);             //Row 7 horizontal interpolation
    618 
    619     tmp0 = _mm_add_epi16(tmp0, tmp1);                   //Row 6 vertical interpolation
    620     tmp0 = _mm_add_epi16(tmp0, value_2);
    621     tmp0 =  _mm_srli_epi16(tmp0, 2);
    622     tmp0 = _mm_packus_epi16(tmp0, value_2);
    623 
    624     _mm_storel_epi64((__m128i *)out, tmp0);
    625 
    626     //Row 7
    627     ref_p2 += ref_wid;
    628     ref_p3 += ref_wid;
    629     out += out_wid;
    630 
    631     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 8
    632     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
    633 
    634     src_r0 =  _mm_cvtepu8_epi16(src_r0);
    635     src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
    636 
    637     tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 8 horizontal interpolation
    638 
    639     tmp1 = _mm_add_epi16(tmp0, tmp1);                   //Row 7 vertical interpolation
    640     tmp1 = _mm_add_epi16(tmp1, value_2);
    641     tmp1 =  _mm_srli_epi16(tmp1, 2);
    642     tmp1 = _mm_packus_epi16(tmp1, value_2);
    643 
    644     _mm_storel_epi64((__m128i *)out, tmp1);
    645 
    646     return;
    647 }
    648 
    649 /*****************************************************************************/
    650 /*                                                                           */
    651 /*  Function Name : impeg2_mc_halfx_fully_8x8_sse42()                                 */
    652 /*                                                                           */
    653 /*  Description   : Gets the buffer from (0.5,0) to (8.5,8)                  */
    654 /*                  and the above block of size 8 x 8 will be placed as a    */
    655 /*                  block from the current position of out_buf               */
    656 /*                                                                           */
/*  Inputs        : ref - Reference frame from which the block will be       */
/*                        extracted.                                         */
/*                  ref_wid - Width of reference frame                       */
/*                  out_wid - Width of the output frame                      */
/*                  Block size is fixed at 8 x 8 (no size arguments)         */
    663 /*                                                                           */
    664 /*  Globals       : None                                                     */
    665 /*                                                                           */
    666 /*  Processing    : Point to the (0,0) and (1,0) position in the ref frame   */
    667 /*                  Interpolate these two values to get the value at(0.5,0)  */
    668 /*                  Repeat this to get an 8 x 8 block using 9 x 8 block from */
    669 /*                  reference frame                                          */
    670 /*                                                                           */
    671 /*  Outputs       : out -  Output containing the extracted block             */
    672 /*                                                                           */
    673 /*  Returns       : None                                                     */
    674 /*                                                                           */
    675 /*  Issues        : None                                                     */
    676 /*                                                                           */
    677 /*****************************************************************************/
    678 void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out,
    679                             UWORD8 *ref,
    680                             UWORD32 ref_wid,
    681                             UWORD32 out_wid)
    682 {
    683     UWORD8 *ref_p0,*ref_p1;
    684     __m128i src_r0, src_r0_1, src_r1, src_r1_1;
    685     /* P0-P3 are the pixels in the reference frame and Q is the value being */
    686     /* estimated                                                            */
    687     /*
    688        P0 Q P1
    689     */
    690 
    691     ref_p0 = ref;
    692     ref_p1 = ref + 1;
    693 
    694     // Row 0 and 1
    695     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 0
    696     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
    697     src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 1
    698     src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
    699 
    700     src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
    701     src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
    702 
    703     _mm_storel_epi64((__m128i *)out, src_r0);
    704     _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
    705 
    706     // Row 2 and 3
    707     ref_p0 += 2*ref_wid;
    708     ref_p1 += 2*ref_wid;
    709     out += 2*out_wid;
    710 
    711     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 2
    712     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
    713     src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 3
    714     src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
    715 
    716     src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
    717     src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
    718 
    719     _mm_storel_epi64((__m128i *)out, src_r0);
    720     _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
    721 
    722     // Row 4 and 5
    723     ref_p0 += 2*ref_wid;
    724     ref_p1 += 2*ref_wid;
    725     out += 2*out_wid;
    726 
    727     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 4
    728     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
    729     src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 5
    730     src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
    731 
    732     src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
    733     src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
    734 
    735     _mm_storel_epi64((__m128i *)out, src_r0);
    736     _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
    737 
    738     // Row 6 and 7
    739     ref_p0 += 2*ref_wid;
    740     ref_p1 += 2*ref_wid;
    741     out += 2*out_wid;
    742 
    743     src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 6
    744     src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
    745     src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 7
    746     src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
    747 
    748     src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
    749     src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
    750 
    751     _mm_storel_epi64((__m128i *)out, src_r0);
    752     _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
    753 
    754     return;
    755 }
    756 
    757 
    /**
     *******************************************************************************
     *
     * @brief
     *  Vertical half-sample prediction of one 8 x 8 block
     *
     * @par Description:
     *  Every output pixel is the rounded average of a reference pixel P0 and the
     *  pixel P1 directly below it, i.e. (P0 + P1 + 1) >> 1, which is the value
     *  _mm_avg_epu8 produces for each byte lane. Each reference row is loaded
     *  once and reused for the output rows above and below it.
     *
     * @param[out] out
     *  Top-left pixel of the destination block; 8 bytes are written to each of
     *  8 rows
     *
     * @param[in] ref
     *  Top-left pixel of the reference block; 8 bytes are read from each of
     *  9 rows
     *
     * @param[in] ref_wid
     *  Offset, in bytes, from one reference row to the next
     *
     * @param[in] out_wid
     *  Offset, in bytes, from one output row to the next
     *
     * @returns
     *  None
     *
     *******************************************************************************
     */
    void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out,
                                         UWORD8 *ref,
                                         UWORD32 ref_wid,
                                         UWORD32 out_wid)
    {
        __m128i row_above, row_mid, row_below;
        __m128i avg_upper, avg_lower;
        int pairs_left = 4;         /* 8 output rows, produced two at a time */

        /* Reference row 0. In later iterations the bottom row of the        */
        /* previous pair takes this role, so it is not loaded a second time. */
        row_above = _mm_loadl_epi64((__m128i *)ref);

        for(;;)
        {
            row_mid = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
            row_below = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));

            avg_upper = _mm_avg_epu8(row_above, row_mid);
            avg_lower = _mm_avg_epu8(row_mid, row_below);

            _mm_storel_epi64((__m128i *)out, avg_upper);
            _mm_storel_epi64((__m128i *)(out + out_wid), avg_lower);

            pairs_left--;
            if(0 == pairs_left)
            {
                break;
            }

            /* Carry the last loaded row over and move down two rows */
            row_above = row_below;
            ref += 2 * ref_wid;
            out += 2 * out_wid;
        }
    }
    840 
    /**
     *******************************************************************************
     *
     * @brief
     *  Full-sample prediction (plain copy) of one 8 x 8 block
     *
     * @par Description:
     *  Copies 8 rows of 8 pixels from the reference block to the output block
     *  without any interpolation. Rows are moved in two groups of four: all four
     *  rows of a group are loaded before any of them is stored.
     *
     * @param[out] out
     *  Top-left pixel of the destination block; 8 bytes are written to each of
     *  8 rows
     *
     * @param[in] ref
     *  Top-left pixel of the reference block; 8 bytes are read from each of
     *  8 rows
     *
     * @param[in] ref_wid
     *  Offset, in bytes, from one reference row to the next
     *
     * @param[in] out_wid
     *  Offset, in bytes, from one output row to the next
     *
     * @returns
     *  None
     *
     *******************************************************************************
     */
    void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out,
                                         UWORD8 *ref,
                                         UWORD32 ref_wid,
                                         UWORD32 out_wid)
    {
        int groups_left = 2;        /* 8 rows, copied four at a time */

        for(;;)
        {
            __m128i row_a = _mm_loadl_epi64((__m128i *)ref);
            __m128i row_b = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
            __m128i row_c = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));
            __m128i row_d = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid));

            _mm_storel_epi64((__m128i *)out, row_a);
            _mm_storel_epi64((__m128i *)(out + out_wid), row_b);
            _mm_storel_epi64((__m128i *)(out + 2 * out_wid), row_c);
            _mm_storel_epi64((__m128i *)(out + 3 * out_wid), row_d);

            groups_left--;
            if(0 == groups_left)
            {
                break;
            }

            /* Move on to rows 4-7 */
            ref += 4 * ref_wid;
            out += 4 * out_wid;
        }
    }
    900