Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 #include <assert.h>
     13 #include <math.h>
     14 
     15 #include "./vpx_config.h"
     16 #include "vpx_mem/vpx_mem.h"
     17 
     18 #include "vp9/common/vp9_quant_common.h"
     19 #include "vp9/common/vp9_seg_common.h"
     20 
     21 #include "vp9/encoder/vp9_encoder.h"
     22 #include "vp9/encoder/vp9_quantize.h"
     23 #include "vp9/encoder/vp9_rd.h"
     24 
     25 #include "vpx_dsp/arm/idct_neon.h"
     26 #include "vpx_dsp/arm/mem_neon.h"
     27 #include "vpx_dsp/vpx_dsp_common.h"
     28 
     29 void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
     30                           int skip_block, const int16_t *round_ptr,
     31                           const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
     32                           tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
     33                           uint16_t *eob_ptr, const int16_t *scan,
     34                           const int16_t *iscan) {
     35   // Quantization pass: All coefficients with index >= zero_flag are
     36   // skippable. Note: zero_flag can be zero.
     37   int i;
     38   const int16x8_t v_zero = vdupq_n_s16(0);
     39   const int16x8_t v_one = vdupq_n_s16(1);
     40   int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
     41   int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
     42   int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
     43   int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
     44 
     45   (void)scan;
     46   (void)skip_block;
     47   assert(!skip_block);
     48 
     49   // adjust for dc
     50   v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
     51   v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
     52   v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
     53   // process dc and the first seven ac coeffs
     54   {
     55     const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
     56     const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
     57     const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
     58     const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
     59     const int32x4_t v_tmp_lo =
     60         vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
     61     const int32x4_t v_tmp_hi =
     62         vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
     63     const int16x8_t v_tmp2 =
     64         vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
     65     const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
     66     const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
     67     const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
     68     const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
     69     const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
     70     const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
     71     v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
     72     store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
     73     store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
     74     v_round = vmovq_n_s16(round_ptr[1]);
     75     v_quant = vmovq_n_s16(quant_ptr[1]);
     76     v_dequant = vmovq_n_s16(dequant_ptr[1]);
     77   }
     78   // now process the rest of the ac coeffs
     79   for (i = 8; i < count; i += 8) {
     80     const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
     81     const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
     82     const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
     83     const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
     84     const int32x4_t v_tmp_lo =
     85         vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
     86     const int32x4_t v_tmp_hi =
     87         vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
     88     const int16x8_t v_tmp2 =
     89         vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
     90     const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
     91     const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
     92     const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
     93     const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
     94     const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
     95     const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
     96     v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
     97     store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
     98     store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
     99   }
    100   {
    101     const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
    102                                              vget_high_s16(v_eobmax_76543210));
    103     const int64x1_t v_eobmax_xx32 =
    104         vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
    105     const int16x4_t v_eobmax_tmp =
    106         vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
    107     const int64x1_t v_eobmax_xxx3 =
    108         vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
    109     const int16x4_t v_eobmax_final =
    110         vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
    111 
    112     *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
    113   }
    114 }
    115 
    116 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
    117   return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
    118 }
    119 
    120 void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
    121                                 int skip_block, const int16_t *round_ptr,
    122                                 const int16_t *quant_ptr,
    123                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    124                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
    125                                 const int16_t *scan, const int16_t *iscan_ptr) {
    126   const int16x8_t one = vdupq_n_s16(1);
    127   const int16x8_t neg_one = vdupq_n_s16(-1);
    128 
    129   // ROUND_POWER_OF_TWO(round_ptr[], 1)
    130   const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
    131   const int16x8_t quant = vld1q_s16(quant_ptr);
    132   const int16x4_t dequant = vld1_s16(dequant_ptr);
    133   // dequant >> 2 is used similar to zbin as a threshold.
    134   const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
    135 
    136   // Process dc and the first seven ac coeffs.
    137   const uint16x8_t iscan =
    138       vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
    139   const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
    140   const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
    141   const int16x8_t coeff_abs = vabsq_s16(coeff);
    142   const int16x8_t dequant_mask =
    143       vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
    144 
    145   int16x8_t qcoeff = vaddq_s16(coeff_abs, round);
    146   int32x4_t dqcoeff_0, dqcoeff_1;
    147   int16x8_t dqcoeff;
    148   uint16x8_t eob_max;
    149   (void)scan;
    150   (void)count;
    151   (void)skip_block;
    152   assert(!skip_block);
    153 
    154   // coeff * quant_ptr[]) >> 15
    155   qcoeff = vqdmulhq_s16(qcoeff, quant);
    156 
    157   // Restore sign.
    158   qcoeff = veorq_s16(qcoeff, coeff_sign);
    159   qcoeff = vsubq_s16(qcoeff, coeff_sign);
    160   qcoeff = vandq_s16(qcoeff, dequant_mask);
    161 
    162   // qcoeff * dequant[] / 2
    163   dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant);
    164   dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
    165 
    166   // Add 1 if negative to round towards zero because the C uses division.
    167   dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
    168   dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
    169 
    170   dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
    171 
    172   eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
    173 
    174   store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
    175   store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
    176 
    177   iscan_ptr += 8;
    178   coeff_ptr += 8;
    179   qcoeff_ptr += 8;
    180   dqcoeff_ptr += 8;
    181 
    182   {
    183     int i;
    184     const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1);
    185     const int16x8_t quant = vmovq_n_s16(quant_ptr[1]);
    186     const int16x8_t dequant_thresh =
    187         vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2);
    188 
    189     // Process the rest of the ac coeffs.
    190     for (i = 8; i < 32 * 32; i += 8) {
    191       const uint16x8_t iscan =
    192           vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
    193       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
    194       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
    195       const int16x8_t coeff_abs = vabsq_s16(coeff);
    196       const int16x8_t dequant_mask =
    197           vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
    198 
    199       int16x8_t qcoeff = vaddq_s16(coeff_abs, round);
    200       int32x4_t dqcoeff_0, dqcoeff_1;
    201       int16x8_t dqcoeff;
    202 
    203       qcoeff = vqdmulhq_s16(qcoeff, quant);
    204       qcoeff = veorq_s16(qcoeff, coeff_sign);
    205       qcoeff = vsubq_s16(qcoeff, coeff_sign);
    206       qcoeff = vandq_s16(qcoeff, dequant_mask);
    207 
    208       dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]);
    209       dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
    210 
    211       dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
    212       dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
    213 
    214       dqcoeff =
    215           vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
    216 
    217       eob_max =
    218           vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
    219 
    220       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
    221       store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
    222 
    223       iscan_ptr += 8;
    224       coeff_ptr += 8;
    225       qcoeff_ptr += 8;
    226       dqcoeff_ptr += 8;
    227     }
    228 
    229     {
    230       const uint16x4_t eob_max_0 =
    231           vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
    232       const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
    233       const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
    234       vst1_lane_u16(eob_ptr, eob_max_2, 0);
    235     }
    236   }
    237 }
    238