1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <emmintrin.h> 12 #include <stdio.h> 13 14 #include "vp9/common/vp9_common.h" 15 16 int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff, 17 intptr_t block_size, int64_t *ssz, 18 int bps) { 19 int i, j, test; 20 uint32_t temp[4]; 21 __m128i max, min, cmp0, cmp1, cmp2, cmp3; 22 int64_t error = 0, sqcoeff = 0; 23 const int shift = 2 * (bps - 8); 24 const int rounding = shift > 0 ? 1 << (shift - 1) : 0; 25 26 for (i = 0; i < block_size; i += 8) { 27 // Load the data into xmm registers 28 __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); 29 __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); 30 __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); 31 __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); 32 // Check if any values require more than 15 bit 33 max = _mm_set1_epi32(0x3fff); 34 min = _mm_set1_epi32(0xffffc000); 35 cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), 36 _mm_cmplt_epi32(mm_coeff, min)); 37 cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), 38 _mm_cmplt_epi32(mm_coeff2, min)); 39 cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), 40 _mm_cmplt_epi32(mm_dqcoeff, min)); 41 cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), 42 _mm_cmplt_epi32(mm_dqcoeff2, min)); 43 test = _mm_movemask_epi8( 44 _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); 45 46 if (!test) { 47 __m128i mm_diff, error_sse2, sqcoeff_sse2; 48 mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); 49 mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); 50 mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); 51 error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); 52 sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); 53 _mm_storeu_si128((__m128i *)temp, error_sse2); 54 error = error + temp[0] + temp[1] + temp[2] + temp[3]; 55 _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); 56 sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; 57 } else { 58 for (j = 0; j < 8; j++) { 59 const int64_t diff = coeff[i + j] - dqcoeff[i + j]; 60 error += diff * diff; 61 sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; 62 } 63 } 64 } 65 assert(error >= 0 && sqcoeff >= 0); 66 error = (error + rounding) >> shift; 67 sqcoeff = (sqcoeff + rounding) >> shift; 68 69 *ssz = sqcoeff; 70 return error; 71 } 72