Home | History | Annotate | Download | only in videodecoder
      1 /*
      2 * Copyright (c) 2009-2011 Intel Corporation.  All rights reserved.
      3 *
      4 * Licensed under the Apache License, Version 2.0 (the "License");
      5 * you may not use this file except in compliance with the License.
      6 * You may obtain a copy of the License at
      7 *
      8 * http://www.apache.org/licenses/LICENSE-2.0
      9 *
     10 * Unless required by applicable law or agreed to in writing, software
     11 * distributed under the License is distributed on an "AS IS" BASIS,
     12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 * See the License for the specific language governing permissions and
     14 * limitations under the License.
     15 */
     16 
     17 #include <emmintrin.h>
     18 #include <x86intrin.h>
     19 
     20 inline void stream_memcpy(void* dst_buff, const void* src_buff, size_t size)
     21 {
     22     bool isAligned = (((size_t)(src_buff) | (size_t)(dst_buff)) & 0xF) == 0;
     23     if (!isAligned) {
     24         memcpy(dst_buff, src_buff, size);
     25         return;
     26     }
     27 
     28     static const size_t regs_count = 8;
     29 
     30     __m128i xmm_data0, xmm_data1, xmm_data2, xmm_data3;
     31     __m128i xmm_data4, xmm_data5, xmm_data6, xmm_data7;
     32 
     33     size_t remain_data = size & (regs_count * sizeof(xmm_data0) - 1);
     34     size_t end_position = 0;
     35 
     36     __m128i* pWb_buff = (__m128i*)dst_buff;
     37     __m128i* pWb_buff_end = pWb_buff + ((size - remain_data) >> 4);
     38     __m128i* pWc_buff = (__m128i*)src_buff;
     39 
     40     /*sync the wc memory data*/
     41     _mm_mfence();
     42 
     43     while (pWb_buff < pWb_buff_end)
     44     {
     45         xmm_data0  = _mm_stream_load_si128(pWc_buff);
     46         xmm_data1  = _mm_stream_load_si128(pWc_buff + 1);
     47         xmm_data2  = _mm_stream_load_si128(pWc_buff + 2);
     48         xmm_data3  = _mm_stream_load_si128(pWc_buff + 3);
     49         xmm_data4  = _mm_stream_load_si128(pWc_buff + 4);
     50         xmm_data5  = _mm_stream_load_si128(pWc_buff + 5);
     51         xmm_data6  = _mm_stream_load_si128(pWc_buff + 6);
     52         xmm_data7  = _mm_stream_load_si128(pWc_buff + 7);
     53 
     54         pWc_buff += regs_count;
     55         _mm_store_si128(pWb_buff, xmm_data0);
     56         _mm_store_si128(pWb_buff + 1, xmm_data1);
     57         _mm_store_si128(pWb_buff + 2, xmm_data2);
     58         _mm_store_si128(pWb_buff + 3, xmm_data3);
     59         _mm_store_si128(pWb_buff + 4, xmm_data4);
     60         _mm_store_si128(pWb_buff + 5, xmm_data5);
     61         _mm_store_si128(pWb_buff + 6, xmm_data6);
     62         _mm_store_si128(pWb_buff + 7, xmm_data7);
     63 
     64         pWb_buff += regs_count;
     65     }
     66 
     67     /*copy data by 16 bytes step from the remainder*/
     68     if (remain_data >= 16)
     69     {
     70         size = remain_data;
     71         remain_data = size & 15;
     72         end_position = size >> 4;
     73         for (size_t i = 0; i < end_position; ++i)
     74         {
     75             pWb_buff[i] = _mm_stream_load_si128(pWc_buff + i);
     76         }
     77     }
     78 
     79     /*copy the remainder data, if it still existed*/
     80     if (remain_data)
     81     {
     82         __m128i temp_data = _mm_stream_load_si128(pWc_buff + end_position);
     83 
     84         char* psrc_buf = (char*)(&temp_data);
     85         char* pdst_buf = (char*)(pWb_buff + end_position);
     86 
     87         for (size_t i = 0; i < remain_data; ++i)
     88         {
     89             pdst_buf[i] = psrc_buf[i];
     90         }
     91     }
     92 
     93 }
     94