1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <emmintrin.h> 12 #include <xmmintrin.h> 13 14 #include "./vp9_rtcd.h" 15 #include "vpx/vpx_integer.h" 16 17 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, 18 int skip_block, const int16_t* zbin_ptr, 19 const int16_t* round_ptr, const int16_t* quant_ptr, 20 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, 21 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, 22 uint16_t* eob_ptr, 23 const int16_t* scan_ptr, 24 const int16_t* iscan_ptr) { 25 __m128i zero; 26 __m128i thr; 27 int16_t nzflag; 28 (void)scan_ptr; 29 (void)zbin_ptr; 30 (void)quant_shift_ptr; 31 32 coeff_ptr += n_coeffs; 33 iscan_ptr += n_coeffs; 34 qcoeff_ptr += n_coeffs; 35 dqcoeff_ptr += n_coeffs; 36 n_coeffs = -n_coeffs; 37 zero = _mm_setzero_si128(); 38 39 if (!skip_block) { 40 __m128i eob; 41 __m128i round, quant, dequant; 42 { 43 __m128i coeff0, coeff1; 44 45 // Setup global values 46 { 47 round = _mm_load_si128((const __m128i*)round_ptr); 48 quant = _mm_load_si128((const __m128i*)quant_ptr); 49 dequant = _mm_load_si128((const __m128i*)dequant_ptr); 50 } 51 52 { 53 __m128i coeff0_sign, coeff1_sign; 54 __m128i qcoeff0, qcoeff1; 55 __m128i qtmp0, qtmp1; 56 // Do DC and first 15 AC 57 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); 58 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); 59 60 // Poor man's sign extract 61 coeff0_sign = _mm_srai_epi16(coeff0, 15); 62 coeff1_sign = _mm_srai_epi16(coeff1, 15); 63 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 64 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 65 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 66 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 67 68 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 69 round = _mm_unpackhi_epi64(round, round); 70 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 71 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 72 quant = _mm_unpackhi_epi64(quant, quant); 73 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 74 75 // Reinsert signs 76 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 77 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 78 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 79 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 80 81 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); 82 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); 83 84 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 85 dequant = _mm_unpackhi_epi64(dequant, dequant); 86 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 87 88 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); 89 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); 90 } 91 92 { 93 // Scan for eob 94 __m128i zero_coeff0, zero_coeff1; 95 __m128i nzero_coeff0, nzero_coeff1; 96 __m128i iscan0, iscan1; 97 __m128i eob1; 98 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 99 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 100 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 101 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 102 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); 103 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); 104 // Add one to convert from indices to counts 105 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 106 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 107 eob = _mm_and_si128(iscan0, nzero_coeff0); 108 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 109 eob = _mm_max_epi16(eob, eob1); 110 } 111 n_coeffs += 8 * 2; 112 } 113 114 thr = _mm_srai_epi16(dequant, 1); 115 116 // AC only loop 117 while (n_coeffs < 0) { 118 __m128i coeff0, coeff1; 119 { 120 __m128i coeff0_sign, coeff1_sign; 121 __m128i qcoeff0, qcoeff1; 122 __m128i qtmp0, qtmp1; 123 124 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); 125 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); 126 127 // Poor man's sign extract 128 coeff0_sign = _mm_srai_epi16(coeff0, 15); 129 coeff1_sign = _mm_srai_epi16(coeff1, 15); 130 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 131 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 132 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 133 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 134 135 nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | 136 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); 137 138 if (nzflag) { 139 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 140 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 141 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 142 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 143 144 // Reinsert signs 145 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 146 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 147 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 148 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 149 150 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); 151 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); 152 153 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 154 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 155 156 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); 157 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); 158 } else { 159 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); 160 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); 161 162 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); 163 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); 164 } 165 } 166 167 if (nzflag) { 168 // Scan for eob 169 __m128i zero_coeff0, zero_coeff1; 170 __m128i nzero_coeff0, nzero_coeff1; 171 __m128i iscan0, iscan1; 172 __m128i eob0, eob1; 173 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 174 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 175 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 176 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 177 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); 178 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); 179 // Add one to convert from indices to counts 180 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 181 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 182 eob0 = _mm_and_si128(iscan0, nzero_coeff0); 183 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 184 eob0 = _mm_max_epi16(eob0, eob1); 185 eob = _mm_max_epi16(eob, eob0); 186 } 187 n_coeffs += 8 * 2; 188 } 189 190 // Accumulate EOB 191 { 192 __m128i eob_shuffled; 193 eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 194 eob = _mm_max_epi16(eob, eob_shuffled); 195 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 196 eob = _mm_max_epi16(eob, eob_shuffled); 197 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 198 eob = _mm_max_epi16(eob, eob_shuffled); 199 *eob_ptr = _mm_extract_epi16(eob, 1); 200 } 201 } else { 202 do { 203 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); 204 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); 205 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); 206 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); 207 n_coeffs += 8 * 2; 208 } while (n_coeffs < 0); 209 *eob_ptr = 0; 210 } 211 } 212