Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <emmintrin.h>
     13 #include <xmmintrin.h>
     14 
     15 #include "./vp9_rtcd.h"
     16 #include "vpx/vpx_integer.h"
     17 #include "vpx_dsp/vpx_dsp_common.h"
     18 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
     19 
     20 void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     21                           int skip_block, const int16_t *round_ptr,
     22                           const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
     23                           tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
     24                           uint16_t *eob_ptr, const int16_t *scan,
     25                           const int16_t *iscan) {
     26   __m128i zero;
     27   __m128i thr;
     28   int16_t nzflag;
     29   __m128i eob;
     30   __m128i round, quant, dequant;
     31 
     32   (void)scan;
     33   (void)skip_block;
     34   assert(!skip_block);
     35 
     36   coeff_ptr += n_coeffs;
     37   iscan += n_coeffs;
     38   qcoeff_ptr += n_coeffs;
     39   dqcoeff_ptr += n_coeffs;
     40   n_coeffs = -n_coeffs;
     41   zero = _mm_setzero_si128();
     42 
     43   {
     44     __m128i coeff0, coeff1;
     45 
     46     // Setup global values
     47     {
     48       round = _mm_load_si128((const __m128i *)round_ptr);
     49       quant = _mm_load_si128((const __m128i *)quant_ptr);
     50       dequant = _mm_load_si128((const __m128i *)dequant_ptr);
     51     }
     52 
     53     {
     54       __m128i coeff0_sign, coeff1_sign;
     55       __m128i qcoeff0, qcoeff1;
     56       __m128i qtmp0, qtmp1;
     57       // Do DC and first 15 AC
     58       coeff0 = load_tran_low(coeff_ptr + n_coeffs);
     59       coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
     60 
     61       // Poor man's sign extract
     62       coeff0_sign = _mm_srai_epi16(coeff0, 15);
     63       coeff1_sign = _mm_srai_epi16(coeff1, 15);
     64       qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     65       qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     66       qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     67       qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     68 
     69       qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     70       round = _mm_unpackhi_epi64(round, round);
     71       qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     72       qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     73       quant = _mm_unpackhi_epi64(quant, quant);
     74       qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     75 
     76       // Reinsert signs
     77       qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
     78       qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
     79       qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     80       qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     81 
     82       store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
     83       store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
     84 
     85       coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     86       dequant = _mm_unpackhi_epi64(dequant, dequant);
     87       coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
     88 
     89       store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
     90       store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
     91     }
     92 
     93     {
     94       // Scan for eob
     95       __m128i zero_coeff0, zero_coeff1;
     96       __m128i nzero_coeff0, nzero_coeff1;
     97       __m128i iscan0, iscan1;
     98       __m128i eob1;
     99       zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    100       zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    101       nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
    102       nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
    103       iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
    104       iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
    105       // Add one to convert from indices to counts
    106       iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
    107       iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
    108       eob = _mm_and_si128(iscan0, nzero_coeff0);
    109       eob1 = _mm_and_si128(iscan1, nzero_coeff1);
    110       eob = _mm_max_epi16(eob, eob1);
    111     }
    112     n_coeffs += 8 * 2;
    113   }
    114 
    115   thr = _mm_srai_epi16(dequant, 1);
    116 
    117   // AC only loop
    118   while (n_coeffs < 0) {
    119     __m128i coeff0, coeff1;
    120     {
    121       __m128i coeff0_sign, coeff1_sign;
    122       __m128i qcoeff0, qcoeff1;
    123       __m128i qtmp0, qtmp1;
    124 
    125       coeff0 = load_tran_low(coeff_ptr + n_coeffs);
    126       coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
    127 
    128       // Poor man's sign extract
    129       coeff0_sign = _mm_srai_epi16(coeff0, 15);
    130       coeff1_sign = _mm_srai_epi16(coeff1, 15);
    131       qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
    132       qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
    133       qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
    134       qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
    135 
    136       nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
    137                _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
    138 
    139       if (nzflag) {
    140         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
    141         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
    142         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
    143         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
    144 
    145         // Reinsert signs
    146         qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
    147         qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
    148         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
    149         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
    150 
    151         store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
    152         store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
    153 
    154         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
    155         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
    156 
    157         store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
    158         store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
    159       } else {
    160         store_zero_tran_low(qcoeff_ptr + n_coeffs);
    161         store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
    162 
    163         store_zero_tran_low(dqcoeff_ptr + n_coeffs);
    164         store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
    165       }
    166     }
    167 
    168     if (nzflag) {
    169       // Scan for eob
    170       __m128i zero_coeff0, zero_coeff1;
    171       __m128i nzero_coeff0, nzero_coeff1;
    172       __m128i iscan0, iscan1;
    173       __m128i eob0, eob1;
    174       zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    175       zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    176       nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
    177       nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
    178       iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
    179       iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
    180       // Add one to convert from indices to counts
    181       iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
    182       iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
    183       eob0 = _mm_and_si128(iscan0, nzero_coeff0);
    184       eob1 = _mm_and_si128(iscan1, nzero_coeff1);
    185       eob0 = _mm_max_epi16(eob0, eob1);
    186       eob = _mm_max_epi16(eob, eob0);
    187     }
    188     n_coeffs += 8 * 2;
    189   }
    190 
    191   // Accumulate EOB
    192   {
    193     __m128i eob_shuffled;
    194     eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
    195     eob = _mm_max_epi16(eob, eob_shuffled);
    196     eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
    197     eob = _mm_max_epi16(eob, eob_shuffled);
    198     eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
    199     eob = _mm_max_epi16(eob, eob_shuffled);
    200     *eob_ptr = _mm_extract_epi16(eob, 1);
    201   }
    202 }
    203