Home | History | Annotate | Download | only in x86
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <smmintrin.h>
     13 #include <stdint.h>
     14 
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "aom_dsp/aom_dsp_common.h"
     18 #include "aom_dsp/x86/synonyms.h"
     19 
     20 // Coefficient quantization phase 1
     21 // param[0-2] : rounding/quan/dequan constants
     22 static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
     23                                          const int shift, const int scale,
     24                                          __m128i *qcoeff, __m128i *dquan,
     25                                          __m128i *sign) {
     26   const __m128i zero = _mm_setzero_si128();
     27   const __m128i one = _mm_set1_epi32(1);
     28 
     29   *sign = _mm_cmplt_epi32(*coeff, zero);
     30   *sign = _mm_or_si128(*sign, one);
     31   *coeff = _mm_abs_epi32(*coeff);
     32 
     33   qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
     34   qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
     35   qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
     36 
     37   qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
     38   qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
     39   dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
     40   dquan[0] = _mm_srli_epi64(dquan[0], scale);
     41   const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
     42   qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
     43 }
     44 
     45 // Coefficient quantization phase 2
     46 static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
     47                                          const __m128i *sign,
     48                                          const __m128i *param, const int shift,
     49                                          const int scale, tran_low_t *qAddr,
     50                                          tran_low_t *dqAddr) {
     51   __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
     52   __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
     53 
     54   qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
     55   qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
     56   dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
     57   dquan[1] = _mm_srli_epi64(dquan[1], scale);
     58 
     59   // combine L&H
     60   qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
     61   qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
     62 
     63   qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
     64   qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
     65 
     66   dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
     67   dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
     68 
     69   dquan[0] = _mm_and_si128(dquan[0], mask0H);
     70   dquan[1] = _mm_and_si128(dquan[1], mask0L);
     71 
     72   qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
     73   dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
     74 
     75   qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
     76   dquan[0] = _mm_sign_epi32(dquan[0], *sign);
     77   qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
     78   dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
     79   _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
     80   _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
     81 }
     82 
     83 static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
     84                             __m128i *eob) {
     85   const __m128i zero = _mm_setzero_si128();
     86   __m128i mask, iscanIdx;
     87   const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
     88   const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
     89   __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
     90   __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
     91 
     92   nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
     93   nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
     94 
     95   mask = _mm_packs_epi32(nz_flag0, nz_flag1);
     96   iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
     97   iscanIdx = _mm_sub_epi16(iscanIdx, mask);
     98   iscanIdx = _mm_and_si128(iscanIdx, mask);
     99   *eob = _mm_max_epi16(*eob, iscanIdx);
    100 }
    101 
    102 static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
    103   __m128i eob_shuffled;
    104   uint16_t eobValue;
    105   eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
    106   *eob = _mm_max_epi16(*eob, eob_shuffled);
    107   eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
    108   *eob = _mm_max_epi16(*eob, eob_shuffled);
    109   eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
    110   *eob = _mm_max_epi16(*eob, eob_shuffled);
    111   eobValue = _mm_extract_epi16(*eob, 0);
    112   return eobValue;
    113 }
    114 
    115 void av1_highbd_quantize_fp_sse4_1(
    116     const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
    117     const int16_t *round_ptr, const int16_t *quant_ptr,
    118     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    119     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    120     const int16_t *scan, const int16_t *iscan, int log_scale) {
    121   __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
    122   __m128i eob = _mm_setzero_si128();
    123   const tran_low_t *src = coeff_ptr;
    124   tran_low_t *quanAddr = qcoeff_ptr;
    125   tran_low_t *dquanAddr = dqcoeff_ptr;
    126   const int shift = 16 - log_scale;
    127   const int coeff_stride = 4;
    128   const int quan_stride = coeff_stride;
    129   (void)zbin_ptr;
    130   (void)quant_shift_ptr;
    131   (void)scan;
    132 
    133   memset(quanAddr, 0, count * sizeof(quanAddr[0]));
    134   memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
    135 
    136   coeff[0] = _mm_loadu_si128((__m128i const *)src);
    137   const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
    138   const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
    139 
    140   qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
    141   qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
    142   qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
    143   qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
    144                             dequant_ptr[0]);
    145 
    146   // DC and first 3 AC
    147   quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
    148                         &coeff_sign);
    149 
    150   // update round/quan/dquan for AC
    151   qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
    152   qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
    153   qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
    154   qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
    155   quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
    156                         quanAddr, dquanAddr);
    157 
    158   // next 4 AC
    159   coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
    160   quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
    161                         &coeff_sign);
    162   quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
    163                         quanAddr + quan_stride, dquanAddr + quan_stride);
    164 
    165   find_eob(quanAddr, iscan, &eob);
    166 
    167   count -= 8;
    168 
    169   // loop for the rest of AC
    170   while (count > 0) {
    171     src += coeff_stride << 1;
    172     quanAddr += quan_stride << 1;
    173     dquanAddr += quan_stride << 1;
    174     iscan += quan_stride << 1;
    175 
    176     coeff[0] = _mm_loadu_si128((__m128i const *)src);
    177     coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
    178 
    179     quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
    180                           &coeff_sign);
    181     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
    182                           log_scale, quanAddr, dquanAddr);
    183 
    184     quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
    185                           &coeff_sign);
    186     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
    187                           log_scale, quanAddr + quan_stride,
    188                           dquanAddr + quan_stride);
    189 
    190     find_eob(quanAddr, iscan, &eob);
    191 
    192     count -= 8;
    193   }
    194   *eob_ptr = get_accumulated_eob(&eob);
    195 }
    196