Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <emmintrin.h>
     12 #include <xmmintrin.h>
     13 
     14 #include "./vpx_dsp_rtcd.h"
     15 #include "vpx/vpx_integer.h"
     16 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
     17 
     18 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     19                          int skip_block, const int16_t *zbin_ptr,
     20                          const int16_t *round_ptr, const int16_t *quant_ptr,
     21                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     22                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
     23                          uint16_t *eob_ptr, const int16_t *scan_ptr,
     24                          const int16_t *iscan_ptr) {
     25   __m128i zero;
     26   (void)scan_ptr;
     27 
     28   coeff_ptr += n_coeffs;
     29   iscan_ptr += n_coeffs;
     30   qcoeff_ptr += n_coeffs;
     31   dqcoeff_ptr += n_coeffs;
     32   n_coeffs = -n_coeffs;
     33   zero = _mm_setzero_si128();
     34   if (!skip_block) {
     35     __m128i eob;
     36     __m128i zbin;
     37     __m128i round, quant, dequant, shift;
     38     {
     39       __m128i coeff0, coeff1;
     40 
     41       // Setup global values
     42       {
     43         __m128i pw_1;
     44         zbin = _mm_load_si128((const __m128i *)zbin_ptr);
     45         round = _mm_load_si128((const __m128i *)round_ptr);
     46         quant = _mm_load_si128((const __m128i *)quant_ptr);
     47         pw_1 = _mm_set1_epi16(1);
     48         zbin = _mm_sub_epi16(zbin, pw_1);
     49         dequant = _mm_load_si128((const __m128i *)dequant_ptr);
     50         shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
     51       }
     52 
     53       {
     54         __m128i coeff0_sign, coeff1_sign;
     55         __m128i qcoeff0, qcoeff1;
     56         __m128i qtmp0, qtmp1;
     57         __m128i cmp_mask0, cmp_mask1;
     58         // Do DC and first 15 AC
     59         coeff0 = load_tran_low(coeff_ptr + n_coeffs);
     60         coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
     61 
     62         // Poor man's sign extract
     63         coeff0_sign = _mm_srai_epi16(coeff0, 15);
     64         coeff1_sign = _mm_srai_epi16(coeff1, 15);
     65         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     66         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     67         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     68         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     69 
     70         cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
     71         zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
     72         cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
     73         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     74         round = _mm_unpackhi_epi64(round, round);
     75         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     76         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     77         quant = _mm_unpackhi_epi64(quant, quant);
     78         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     79         qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
     80         qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
     81         qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
     82         shift = _mm_unpackhi_epi64(shift, shift);
     83         qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
     84 
     85         // Reinsert signs
     86         qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
     87         qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
     88         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     89         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     90 
     91         // Mask out zbin threshold coeffs
     92         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
     93         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
     94 
     95         store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
     96         store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
     97 
     98         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     99         dequant = _mm_unpackhi_epi64(dequant, dequant);
    100         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
    101 
    102         store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
    103         store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
    104       }
    105 
    106       {
    107         // Scan for eob
    108         __m128i zero_coeff0, zero_coeff1;
    109         __m128i nzero_coeff0, nzero_coeff1;
    110         __m128i iscan0, iscan1;
    111         __m128i eob1;
    112         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    113         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    114         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
    115         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
    116         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
    117         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
    118         // Add one to convert from indices to counts
    119         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
    120         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
    121         eob = _mm_and_si128(iscan0, nzero_coeff0);
    122         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
    123         eob = _mm_max_epi16(eob, eob1);
    124       }
    125       n_coeffs += 8 * 2;
    126     }
    127 
    128     // AC only loop
    129     while (n_coeffs < 0) {
    130       __m128i coeff0, coeff1;
    131       {
    132         __m128i coeff0_sign, coeff1_sign;
    133         __m128i qcoeff0, qcoeff1;
    134         __m128i qtmp0, qtmp1;
    135         __m128i cmp_mask0, cmp_mask1;
    136 
    137         coeff0 = load_tran_low(coeff_ptr + n_coeffs);
    138         coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
    139 
    140         // Poor man's sign extract
    141         coeff0_sign = _mm_srai_epi16(coeff0, 15);
    142         coeff1_sign = _mm_srai_epi16(coeff1, 15);
    143         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
    144         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
    145         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
    146         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
    147 
    148         cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    149         cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
    150         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
    151         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
    152         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
    153         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
    154         qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
    155         qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
    156         qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
    157         qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
    158 
    159         // Reinsert signs
    160         qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
    161         qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
    162         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
    163         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
    164 
    165         // Mask out zbin threshold coeffs
    166         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    167         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    168 
    169         store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
    170         store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
    171 
    172         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
    173         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
    174 
    175         store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
    176         store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
    177       }
    178 
    179       {
    180         // Scan for eob
    181         __m128i zero_coeff0, zero_coeff1;
    182         __m128i nzero_coeff0, nzero_coeff1;
    183         __m128i iscan0, iscan1;
    184         __m128i eob0, eob1;
    185         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
    186         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
    187         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
    188         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
    189         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
    190         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
    191         // Add one to convert from indices to counts
    192         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
    193         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
    194         eob0 = _mm_and_si128(iscan0, nzero_coeff0);
    195         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
    196         eob0 = _mm_max_epi16(eob0, eob1);
    197         eob = _mm_max_epi16(eob, eob0);
    198       }
    199       n_coeffs += 8 * 2;
    200     }
    201 
    202     // Accumulate EOB
    203     {
    204       __m128i eob_shuffled;
    205       eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
    206       eob = _mm_max_epi16(eob, eob_shuffled);
    207       eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
    208       eob = _mm_max_epi16(eob, eob_shuffled);
    209       eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
    210       eob = _mm_max_epi16(eob, eob_shuffled);
    211       *eob_ptr = _mm_extract_epi16(eob, 1);
    212     }
    213   } else {
    214     do {
    215       store_tran_low(zero, dqcoeff_ptr + n_coeffs);
    216       store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
    217       store_tran_low(zero, qcoeff_ptr + n_coeffs);
    218       store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
    219       n_coeffs += 8 * 2;
    220     } while (n_coeffs < 0);
    221     *eob_ptr = 0;
    222   }
    223 }
    224