/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"

/* Each __m256i in |in| holds one row of two horizontally adjacent 8x8 blocks,
 * one block per 128-bit lane. This applies an 8-point Hadamard butterfly down
 * the rows; on the first pass (iter == 0) it also transposes each lane so the
 * second pass operates along the other dimension. */
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi16(a0, a1);
  __m256i b1 = _mm256_sub_epi16(a0, a1);
  __m256i b2 = _mm256_add_epi16(a2, a3);
  __m256i b3 = _mm256_sub_epi16(a2, a3);
  __m256i b4 = _mm256_add_epi16(a4, a5);
  __m256i b5 = _mm256_sub_epi16(a4, a5);
  __m256i b6 = _mm256_add_epi16(a6, a7);
  __m256i b7 = _mm256_sub_epi16(a6, a7);

  a0 = _mm256_add_epi16(b0, b2);
  a1 = _mm256_add_epi16(b1, b3);
  a2 = _mm256_sub_epi16(b0, b2);
  a3 = _mm256_sub_epi16(b1, b3);
  a4 = _mm256_add_epi16(b4, b6);
  a5 = _mm256_add_epi16(b5, b7);
  a6 = _mm256_sub_epi16(b4, b6);
  a7 = _mm256_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi16(a0, a4);
    b7 = _mm256_add_epi16(a1, a5);
    b3 = _mm256_add_epi16(a2, a6);
    b4 = _mm256_add_epi16(a3, a7);
    b2 = _mm256_sub_epi16(a0, a4);
    b6 = _mm256_sub_epi16(a1, a5);
    b1 = _mm256_sub_epi16(a2, a6);
    b5 = _mm256_sub_epi16(a3, a7);

    /* 8x8 16-bit transpose within each 128-bit lane. */
    a0 = _mm256_unpacklo_epi16(b0, b1);
    a1 = _mm256_unpacklo_epi16(b2, b3);
    a2 = _mm256_unpackhi_epi16(b0, b1);
    a3 = _mm256_unpackhi_epi16(b2, b3);
    a4 = _mm256_unpacklo_epi16(b4, b5);
    a5 = _mm256_unpacklo_epi16(b6, b7);
    a6 = _mm256_unpackhi_epi16(b4, b5);
    a7 = _mm256_unpackhi_epi16(b6, b7);

    b0 = _mm256_unpacklo_epi32(a0, a1);
    b1 = _mm256_unpacklo_epi32(a4, a5);
    b2 = _mm256_unpackhi_epi32(a0, a1);
    b3 = _mm256_unpackhi_epi32(a4, a5);
    b4 = _mm256_unpacklo_epi32(a2, a3);
    b5 = _mm256_unpacklo_epi32(a6, a7);
    b6 = _mm256_unpackhi_epi32(a2, a3);
    b7 = _mm256_unpackhi_epi32(a6, a7);

    in[0] = _mm256_unpacklo_epi64(b0, b1);
    in[1] = _mm256_unpackhi_epi64(b0, b1);
    in[2] = _mm256_unpacklo_epi64(b2, b3);
    in[3] = _mm256_unpackhi_epi64(b2, b3);
    in[4] = _mm256_unpacklo_epi64(b4, b5);
    in[5] = _mm256_unpackhi_epi64(b4, b5);
    in[6] = _mm256_unpacklo_epi64(b6, b7);
    in[7] = _mm256_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm256_add_epi16(a0, a4);
    in[7] = _mm256_add_epi16(a1, a5);
    in[3] = _mm256_add_epi16(a2, a6);
    in[4] = _mm256_add_epi16(a3, a7);
    in[2] = _mm256_sub_epi16(a0, a4);
    in[6] = _mm256_sub_epi16(a1, a5);
    in[1] = _mm256_sub_epi16(a2, a6);
    in[5] = _mm256_sub_epi16(a3, a7);
  }
}

/* Computes the 8x8 Hadamard transform of two side-by-side 8x8 residual blocks,
 * then de-interleaves the 128-bit lanes so the first block's 64 coefficients
 * precede the second block's in |coeff| (128 int16_t values in total). */
static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}

/* Builds the 16x16 Hadamard transform from four 8x8 transforms, combining them
 * with one more butterfly stage. Intermediate sums are halved (>> 1) to keep
 * the coefficients within 16-bit range, matching the C reference. */
void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  int idx;
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif

  for (idx = 0; idx < 2; ++idx) {
    int16_t const *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);

    coeff += 16;
    t_coeff += 16;
  }
}

/* Returns the sum of absolute values of |length| transform coefficients. */
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i one = _mm256_set1_epi16(1);
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 16) {
    const __m256i src_line = load_tran_low(coeff);
    const __m256i abs = _mm256_abs_epi16(src_line);
    const __m256i sum = _mm256_madd_epi16(abs, one);
    accum = _mm256_add_epi32(accum, sum);
    coeff += 16;
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
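
/* Usage sketch (not part of the upstream file, compiled out below): a minimal,
 * hypothetical harness showing how the two exported functions fit together.
 * vpx_hadamard_16x16_avx2() consumes a 16x16 block of 16-bit pixel differences
 * (stride given in int16_t elements) and writes 256 coefficients;
 * vpx_satd_avx2() then reduces such coefficients to a sum of absolute values.
 * The function and variable names here are illustrative only, and the sketch
 * assumes the caller has already verified AVX2 support at run time. */
#if 0
#include <stdio.h>

static void hadamard_satd_example(void) {
  DECLARE_ALIGNED(32, int16_t, src_diff[16 * 16]);
  DECLARE_ALIGNED(32, tran_low_t, coeff[16 * 16]);
  int i;

  /* Fill the residual block with an arbitrary, deterministic pattern. */
  for (i = 0; i < 16 * 16; ++i) src_diff[i] = (int16_t)((i * 7) % 64 - 32);

  /* Forward 16x16 Hadamard over a packed 16-wide block (stride = 16). */
  vpx_hadamard_16x16_avx2(src_diff, 16, coeff);

  /* Sum of absolute transform coefficients over all 256 values. */
  printf("satd = %d\n", vpx_satd_avx2(coeff, 16 * 16));
}
#endif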