/*
 * Copyright (c) 2009-2011 Intel Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#include <smmintrin.h>
#include <x86intrin.h>

/*
 * _mm_stream_load_si128 (MOVNTDQA) is an SSE4.1 instruction.
 *
 * - When the translation unit is already built for SSE4.1 (__SSE4_1__ is
 *   defined) nothing extra is needed and the streaming path is always used.
 * - Otherwise, on GCC-compatible compilers, the streaming helper opts in to
 *   SSE4.1 for itself only, and the entry point checks the CPU at run time
 *   before dispatching to it.
 */
#if defined(__SSE4_1__) || !defined(__GNUC__)
#define STREAM_MEMCPY_SSE41_FN
#define STREAM_MEMCPY_CPU_HAS_SSE41() 1
#else
#define STREAM_MEMCPY_SSE41_FN __attribute__((target("sse4.1")))
#define STREAM_MEMCPY_CPU_HAS_SSE41() __builtin_cpu_supports("sse4.1")
#endif

/*
 * Streaming-load copy for buffers that are both 16-byte aligned.
 *
 * Private helper of stream_memcpy(); callers must guarantee that dst_buff and
 * src_buff are 16-byte aligned and that the CPU supports SSE4.1.
 *
 * The copy runs in three phases:
 *   1. 128-byte chunks, eight streaming loads followed by eight aligned stores;
 *   2. the remaining whole 16-byte vectors, one at a time;
 *   3. a final partial vector (1..15 bytes), if any.
 *
 * NOTE: phase 3 issues one full 16-byte load from the source even though fewer
 * bytes are copied, so up to 15 bytes beyond src_buff + size are read (never
 * written to the destination). Because the address is 16-byte aligned the load
 * stays inside one aligned 16-byte line; the source must be readable up to
 * that boundary.
 */
static inline STREAM_MEMCPY_SSE41_FN void
stream_memcpy_aligned_sse41(void *dst_buff, const void *src_buff, size_t size)
{
    enum { VEC_BYTES = 16, UNROLL = 8, CHUNK_BYTES = VEC_BYTES * UNROLL };

    /*
     * The source is only ever read. const is dropped because some compilers
     * declare _mm_stream_load_si128 with a non-const pointer parameter.
     */
    __m128i *src = (__m128i *)src_buff;
    __m128i *dst = (__m128i *)dst_buff;

    /* Bytes left over once all whole 128-byte chunks have been copied. */
    size_t tail_bytes = size % CHUNK_BYTES;
    __m128i *const unrolled_end = dst + (size - tail_bytes) / VEC_BYTES;

    /*
     * Full fence before the first streaming load ("sync the wc memory data"
     * in the original code). Presumably this keeps the loads from observing
     * stale write-combining data -- confirm against the producer of the
     * source buffer.
     */
    _mm_mfence();

    /* Phase 1: eight loads, then eight stores, per 128-byte chunk. */
    while (dst < unrolled_end) {
        const __m128i v0 = _mm_stream_load_si128(src + 0);
        const __m128i v1 = _mm_stream_load_si128(src + 1);
        const __m128i v2 = _mm_stream_load_si128(src + 2);
        const __m128i v3 = _mm_stream_load_si128(src + 3);
        const __m128i v4 = _mm_stream_load_si128(src + 4);
        const __m128i v5 = _mm_stream_load_si128(src + 5);
        const __m128i v6 = _mm_stream_load_si128(src + 6);
        const __m128i v7 = _mm_stream_load_si128(src + 7);
        src += UNROLL;

        _mm_store_si128(dst + 0, v0);
        _mm_store_si128(dst + 1, v1);
        _mm_store_si128(dst + 2, v2);
        _mm_store_si128(dst + 3, v3);
        _mm_store_si128(dst + 4, v4);
        _mm_store_si128(dst + 5, v5);
        _mm_store_si128(dst + 6, v6);
        _mm_store_si128(dst + 7, v7);
        dst += UNROLL;
    }

    /* Phase 2: remaining whole 16-byte vectors (at most seven). */
    for (; tail_bytes >= VEC_BYTES; tail_bytes -= VEC_BYTES) {
        _mm_store_si128(dst, _mm_stream_load_si128(src));
        ++src;
        ++dst;
    }

    /* Phase 3: partial vector; load 16 bytes, store only what was asked for. */
    if (tail_bytes != 0) {
        const __m128i last = _mm_stream_load_si128(src);
        memcpy(dst, &last, tail_bytes);
    }
}

/*
 * Copy size bytes from src_buff to dst_buff, using SSE4.1 streaming loads
 * when possible.
 *
 * The streaming path is taken only when BOTH pointers are 16-byte aligned
 * (and, for builds that do not target SSE4.1 globally, when the running CPU
 * reports SSE4.1 support). In every other case the call is a plain memcpy().
 *
 * dst_buff  destination buffer, at least size bytes.
 * src_buff  source buffer, at least size bytes; the local names in the
 *           original code suggest it is expected to be write-combining
 *           memory -- confirm with callers.
 * size      number of bytes to copy; 0 is allowed.
 *
 * The buffers must not overlap. On the streaming path, a size that is not a
 * multiple of 16 causes a read (not a copy) of up to 15 source bytes past
 * size; see stream_memcpy_aligned_sse41().
 *
 * Declared static inline so the header can be included from any number of C
 * or C++ translation units without relying on an out-of-line definition.
 */
static inline void stream_memcpy(void* dst_buff, const void* src_buff, size_t size)
{
    const bool is_aligned =
        ((((uintptr_t)src_buff) | ((uintptr_t)dst_buff)) & 0xF) == 0;

    if (!is_aligned || !STREAM_MEMCPY_CPU_HAS_SSE41()) {
        memcpy(dst_buff, src_buff, size);
        return;
    }

    stream_memcpy_aligned_sse41(dst_buff, src_buff, size);
}