/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
                            int height, const uint8_t *ref, int ref_stride) {
  /* comp and pred must be 16-byte aligned. */
  assert(((intptr_t)comp & 0xf) == 0);
  assert(((intptr_t)pred & 0xf) == 0);
  if (width > 8) {
    // width is a multiple of 16 for the supported block sizes, so each row
    // can be processed in whole 16-byte chunks.
    int x, y;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; x += 16) {
        const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
        const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
        const __m128i avg = _mm_avg_epu8(p, r);
        _mm_store_si128((__m128i *)(comp + x), avg);
      }
      comp += width;
      pred += width;
      ref += ref_stride;
    }
  } else {  // width must be 4 or 8.
    int i;
    // Process 16 elements at a time. comp and pred have width == stride and
    // therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
    // all divisible by 16, so only ref needs to be massaged when loading.
    for (i = 0; i < width * height; i += 16) {
      const __m128i p = _mm_load_si128((const __m128i *)pred);
      __m128i r;
      __m128i avg;
      if (width == ref_stride) {
        // ref is also contiguous; one unaligned 16-byte load covers it.
        r = _mm_loadu_si128((const __m128i *)ref);
        ref += 16;
      } else if (width == 4) {
        // Gather four 4-byte rows of ref into a single 16-byte register.
        r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride),
                          *(const uint32_t *)(ref + 2 * ref_stride),
                          *(const uint32_t *)(ref + ref_stride),
                          *(const uint32_t *)(ref));

        ref += 4 * ref_stride;
      } else {
        // Pack two 8-byte rows of ref into the low and high halves.
        const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
        assert(width == 8);
        r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
                                          (const __m64 *)(ref + ref_stride)));

        ref += 2 * ref_stride;
      }
      avg = _mm_avg_epu8(p, r);
      _mm_store_si128((__m128i *)comp, avg);

      pred += 16;
      comp += 16;
    }
  }
}
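
// For reference, _mm_avg_epu8 computes a rounded average per byte,
// (a + b + 1) >> 1, so the kernel above should match a plain scalar loop
// lane for lane. The sketch below is a minimal scalar cross-check; the
// helper name comp_avg_pred_ref_c is hypothetical and not part of the
// vpx_dsp interface, and this is a testing aid, not the project's own
// reference implementation.
static void comp_avg_pred_ref_c(uint8_t *comp, const uint8_t *pred, int width,
                                int height, const uint8_t *ref,
                                int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      // Rounded byte-wise average, mirroring _mm_avg_epu8.
      comp[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);
    }
    comp += width;  // comp and pred are contiguous (stride == width).
    pred += width;
    ref += ref_stride;
  }
}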