1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <immintrin.h> 13 14 #include "./vp9_rtcd.h" 15 #include "vpx/vpx_integer.h" 16 #include "vpx_dsp/vpx_dsp_common.h" 17 #include "vpx_dsp/x86/bitdepth_conversion_avx2.h" 18 19 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, 20 intptr_t block_size, int64_t *ssz) { 21 __m256i sse_256, ssz_256; 22 __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; 23 __m256i sse_hi, ssz_hi; 24 __m128i sse_128, ssz_128; 25 int64_t sse; 26 const __m256i zero = _mm256_setzero_si256(); 27 28 // If the block size is 16 then the results will fit in 32 bits. 29 if (block_size == 16) { 30 __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi; 31 // Load 16 elements for coeff and dqcoeff. 32 coeff_256 = load_tran_low(coeff); 33 dqcoeff_256 = load_tran_low(dqcoeff); 34 // dqcoeff - coeff 35 dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256); 36 // madd (dqcoeff - coeff) 37 dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256); 38 // madd coeff 39 coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256); 40 // Save the higher 64 bit of each 128 bit lane. 41 dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8); 42 coeff_hi = _mm256_srli_si256(coeff_256, 8); 43 // Add the higher 64 bit to the low 64 bit. 44 dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi); 45 coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi); 46 // Expand each double word in the lower 64 bits to quad word. 47 sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero); 48 ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero); 49 } else { 50 int i; 51 assert(block_size % 32 == 0); 52 sse_256 = zero; 53 ssz_256 = zero; 54 55 for (i = 0; i < block_size; i += 32) { 56 __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1; 57 // Load 32 elements for coeff and dqcoeff. 58 coeff_0 = load_tran_low(coeff + i); 59 dqcoeff_0 = load_tran_low(dqcoeff + i); 60 coeff_1 = load_tran_low(coeff + i + 16); 61 dqcoeff_1 = load_tran_low(dqcoeff + i + 16); 62 // dqcoeff - coeff 63 dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0); 64 dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1); 65 // madd (dqcoeff - coeff) 66 dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0); 67 dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1); 68 // madd coeff 69 coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0); 70 coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1); 71 // Add the first madd (dqcoeff - coeff) with the second. 72 dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1); 73 // Add the first madd (coeff) with the second. 74 coeff_0 = _mm256_add_epi32(coeff_0, coeff_1); 75 // Expand each double word of madd (dqcoeff - coeff) to quad word. 76 exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero); 77 exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero); 78 // expand each double word of madd (coeff) to quad word 79 exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero); 80 exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero); 81 // Add each quad word of madd (dqcoeff - coeff) and madd (coeff). 82 sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo); 83 ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo); 84 sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi); 85 ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi); 86 } 87 } 88 // Save the higher 64 bit of each 128 bit lane. 89 sse_hi = _mm256_srli_si256(sse_256, 8); 90 ssz_hi = _mm256_srli_si256(ssz_256, 8); 91 // Add the higher 64 bit to the low 64 bit. 92 sse_256 = _mm256_add_epi64(sse_256, sse_hi); 93 ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi); 94 95 // Add each 64 bit from each of the 128 bit lane of the 256 bit. 96 sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), 97 _mm256_extractf128_si256(sse_256, 1)); 98 99 ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256), 100 _mm256_extractf128_si256(ssz_256, 1)); 101 102 // Store the results. 103 _mm_storel_epi64((__m128i *)(&sse), sse_128); 104 105 _mm_storel_epi64((__m128i *)(ssz), ssz_128); 106 return sse; 107 } 108 109 int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, 110 const tran_low_t *dqcoeff, int block_size) { 111 int i; 112 const __m256i zero = _mm256_setzero_si256(); 113 __m256i sse_256 = zero; 114 __m256i sse_hi; 115 __m128i sse_128; 116 int64_t sse; 117 118 if (block_size == 16) { 119 // Load 16 elements for coeff and dqcoeff. 120 const __m256i _coeff = load_tran_low(coeff); 121 const __m256i _dqcoeff = load_tran_low(dqcoeff); 122 // dqcoeff - coeff 123 const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); 124 // madd (dqcoeff - coeff) 125 const __m256i error_lo = _mm256_madd_epi16(diff, diff); 126 // Save the higher 64 bit of each 128 bit lane. 127 const __m256i error_hi = _mm256_srli_si256(error_lo, 8); 128 // Add the higher 64 bit to the low 64 bit. 129 const __m256i error = _mm256_add_epi32(error_lo, error_hi); 130 // Expand each double word in the lower 64 bits to quad word. 131 sse_256 = _mm256_unpacklo_epi32(error, zero); 132 } else { 133 for (i = 0; i < block_size; i += 16) { 134 // Load 16 elements for coeff and dqcoeff. 135 const __m256i _coeff = load_tran_low(coeff); 136 const __m256i _dqcoeff = load_tran_low(dqcoeff); 137 const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); 138 const __m256i error = _mm256_madd_epi16(diff, diff); 139 // Expand each double word of madd (dqcoeff - coeff) to quad word. 140 const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); 141 const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); 142 // Add each quad word of madd (dqcoeff - coeff). 143 sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); 144 sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); 145 coeff += 16; 146 dqcoeff += 16; 147 } 148 } 149 // Save the higher 64 bit of each 128 bit lane. 150 sse_hi = _mm256_srli_si256(sse_256, 8); 151 // Add the higher 64 bit to the low 64 bit. 152 sse_256 = _mm256_add_epi64(sse_256, sse_hi); 153 154 // Add each 64 bit from each of the 128 bit lane of the 256 bit. 155 sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), 156 _mm256_extractf128_si256(sse_256, 1)); 157 158 // Store the results. 159 _mm_storel_epi64((__m128i *)&sse, sse_128); 160 return sse; 161 } 162