1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 #include <assert.h> 13 #include <math.h> 14 15 #include "./vpx_config.h" 16 #include "vpx_mem/vpx_mem.h" 17 18 #include "vp9/common/vp9_quant_common.h" 19 #include "vp9/common/vp9_seg_common.h" 20 21 #include "vp9/encoder/vp9_encoder.h" 22 #include "vp9/encoder/vp9_quantize.h" 23 #include "vp9/encoder/vp9_rd.h" 24 25 #include "vpx_dsp/arm/idct_neon.h" 26 #include "vpx_dsp/arm/mem_neon.h" 27 #include "vpx_dsp/vpx_dsp_common.h" 28 29 void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, 30 int skip_block, const int16_t *round_ptr, 31 const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, 32 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 33 uint16_t *eob_ptr, const int16_t *scan, 34 const int16_t *iscan) { 35 // Quantization pass: All coefficients with index >= zero_flag are 36 // skippable. Note: zero_flag can be zero. 37 int i; 38 const int16x8_t v_zero = vdupq_n_s16(0); 39 const int16x8_t v_one = vdupq_n_s16(1); 40 int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); 41 int16x8_t v_round = vmovq_n_s16(round_ptr[1]); 42 int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); 43 int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); 44 45 (void)scan; 46 (void)skip_block; 47 assert(!skip_block); 48 49 // adjust for dc 50 v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); 51 v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); 52 v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); 53 // process dc and the first seven ac coeffs 54 { 55 const int16x8_t v_iscan = vld1q_s16(&iscan[0]); 56 const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); 57 const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); 58 const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); 59 const int32x4_t v_tmp_lo = 60 vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); 61 const int32x4_t v_tmp_hi = 62 vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); 63 const int16x8_t v_tmp2 = 64 vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); 65 const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); 66 const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); 67 const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); 68 const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); 69 const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); 70 const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); 71 v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); 72 store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); 73 store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); 74 v_round = vmovq_n_s16(round_ptr[1]); 75 v_quant = vmovq_n_s16(quant_ptr[1]); 76 v_dequant = vmovq_n_s16(dequant_ptr[1]); 77 } 78 // now process the rest of the ac coeffs 79 for (i = 8; i < count; i += 8) { 80 const int16x8_t v_iscan = vld1q_s16(&iscan[i]); 81 const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); 82 const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); 83 const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); 84 const int32x4_t v_tmp_lo = 85 vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); 86 const int32x4_t v_tmp_hi = 87 vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); 88 const int16x8_t v_tmp2 = 89 vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); 90 const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); 91 const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); 92 const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); 93 const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); 94 const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); 95 const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); 96 v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); 97 store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); 98 store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); 99 } 100 { 101 const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), 102 vget_high_s16(v_eobmax_76543210)); 103 const int64x1_t v_eobmax_xx32 = 104 vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); 105 const int16x4_t v_eobmax_tmp = 106 vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); 107 const int64x1_t v_eobmax_xxx3 = 108 vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); 109 const int16x4_t v_eobmax_final = 110 vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); 111 112 *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); 113 } 114 } 115 116 static INLINE int32x4_t extract_sign_bit(int32x4_t a) { 117 return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); 118 } 119 120 void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, 121 int skip_block, const int16_t *round_ptr, 122 const int16_t *quant_ptr, 123 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 124 const int16_t *dequant_ptr, uint16_t *eob_ptr, 125 const int16_t *scan, const int16_t *iscan_ptr) { 126 const int16x8_t one = vdupq_n_s16(1); 127 const int16x8_t neg_one = vdupq_n_s16(-1); 128 129 // ROUND_POWER_OF_TWO(round_ptr[], 1) 130 const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); 131 const int16x8_t quant = vld1q_s16(quant_ptr); 132 const int16x4_t dequant = vld1_s16(dequant_ptr); 133 // dequant >> 2 is used similar to zbin as a threshold. 134 const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); 135 136 // Process dc and the first seven ac coeffs. 137 const uint16x8_t iscan = 138 vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); 139 const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); 140 const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); 141 const int16x8_t coeff_abs = vabsq_s16(coeff); 142 const int16x8_t dequant_mask = 143 vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); 144 145 int16x8_t qcoeff = vaddq_s16(coeff_abs, round); 146 int32x4_t dqcoeff_0, dqcoeff_1; 147 int16x8_t dqcoeff; 148 uint16x8_t eob_max; 149 (void)scan; 150 (void)count; 151 (void)skip_block; 152 assert(!skip_block); 153 154 // coeff * quant_ptr[]) >> 15 155 qcoeff = vqdmulhq_s16(qcoeff, quant); 156 157 // Restore sign. 158 qcoeff = veorq_s16(qcoeff, coeff_sign); 159 qcoeff = vsubq_s16(qcoeff, coeff_sign); 160 qcoeff = vandq_s16(qcoeff, dequant_mask); 161 162 // qcoeff * dequant[] / 2 163 dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); 164 dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); 165 166 // Add 1 if negative to round towards zero because the C uses division. 167 dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); 168 dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); 169 170 dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); 171 172 eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); 173 174 store_s16q_to_tran_low(qcoeff_ptr, qcoeff); 175 store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); 176 177 iscan_ptr += 8; 178 coeff_ptr += 8; 179 qcoeff_ptr += 8; 180 dqcoeff_ptr += 8; 181 182 { 183 int i; 184 const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); 185 const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); 186 const int16x8_t dequant_thresh = 187 vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); 188 189 // Process the rest of the ac coeffs. 190 for (i = 8; i < 32 * 32; i += 8) { 191 const uint16x8_t iscan = 192 vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); 193 const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); 194 const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); 195 const int16x8_t coeff_abs = vabsq_s16(coeff); 196 const int16x8_t dequant_mask = 197 vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); 198 199 int16x8_t qcoeff = vaddq_s16(coeff_abs, round); 200 int32x4_t dqcoeff_0, dqcoeff_1; 201 int16x8_t dqcoeff; 202 203 qcoeff = vqdmulhq_s16(qcoeff, quant); 204 qcoeff = veorq_s16(qcoeff, coeff_sign); 205 qcoeff = vsubq_s16(qcoeff, coeff_sign); 206 qcoeff = vandq_s16(qcoeff, dequant_mask); 207 208 dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); 209 dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); 210 211 dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); 212 dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); 213 214 dqcoeff = 215 vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); 216 217 eob_max = 218 vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); 219 220 store_s16q_to_tran_low(qcoeff_ptr, qcoeff); 221 store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); 222 223 iscan_ptr += 8; 224 coeff_ptr += 8; 225 qcoeff_ptr += 8; 226 dqcoeff_ptr += 8; 227 } 228 229 { 230 const uint16x4_t eob_max_0 = 231 vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); 232 const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); 233 const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); 234 vst1_lane_u16(eob_ptr, eob_max_2, 0); 235 } 236 } 237 } 238