Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <tmmintrin.h> /* SSSE3 */
     12 
     13 #include "vp8/encoder/block.h"
     14 
     15 /* bitscan reverse (bsr) */
     16 #if defined(_MSC_VER)
     17 #include <intrin.h>
     18 #pragma intrinsic(_BitScanReverse)
     19 static int bsr(int mask) {
     20   int eob;
     21   _BitScanReverse(&eob, mask);
     22   eob++;
     23   if (mask == 0)
     24     eob = 0;
     25   return eob;
     26 }
     27 #else
     28 static int bsr(int mask) {
     29   int eob;
     30 #if defined(__GNUC__) && __GNUC__
     31   __asm__ __volatile__("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
     32 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
     33   asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
     34 #endif
     35   eob++;
     36   if (mask == 0)
     37     eob = 0;
     38   return eob;
     39 }
     40 #endif
     41 
     42 void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
     43   int eob, mask;
     44 
     45   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
     46   __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
     47   __m128i round0 = _mm_load_si128((__m128i *)(b->round));
     48   __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
     49   __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
     50   __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
     51   __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
     52   __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
     53 
     54   __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;
     55 
     56   DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) =
     57     { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
     58   __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);
     59 
     60   /* sign of z: z >> 15 */
     61   sz0 = _mm_srai_epi16(z0, 15);
     62   sz1 = _mm_srai_epi16(z1, 15);
     63 
     64   /* x = abs(z) */
     65   x0 = _mm_abs_epi16(z0);
     66   x1 = _mm_abs_epi16(z1);
     67 
     68   /* x += round */
     69   x0 = _mm_add_epi16(x0, round0);
     70   x1 = _mm_add_epi16(x1, round1);
     71 
     72   /* y = (x * quant) >> 16 */
     73   y0 = _mm_mulhi_epi16(x0, quant_fast0);
     74   y1 = _mm_mulhi_epi16(x1, quant_fast1);
     75 
     76   /* ASM saves Y for EOB */
     77   /* I think we can ignore that because adding the sign doesn't change anything
     78    * and multiplying 0 by dequant is OK as well */
     79   abs0 = y0;
     80   abs1 = y1;
     81 
     82   /* Restore the sign bit. */
     83   y0 = _mm_xor_si128(y0, sz0);
     84   y1 = _mm_xor_si128(y1, sz1);
     85   x0 = _mm_sub_epi16(y0, sz0);
     86   x1 = _mm_sub_epi16(y1, sz1);
     87 
     88   /* qcoeff = x */
     89   _mm_store_si128((__m128i *)(d->qcoeff), x0);
     90   _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
     91 
     92   /* x * dequant */
     93   x0 = _mm_mullo_epi16(x0, dequant0);
     94   x1 = _mm_mullo_epi16(x1, dequant1);
     95 
     96   /* dqcoeff = x * dequant */
     97   _mm_store_si128((__m128i *)(d->dqcoeff), x0);
     98   _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1);
     99 
    100   zeros = _mm_setzero_si128();
    101 
    102   x0 = _mm_cmpgt_epi16(abs0, zeros);
    103   x1 = _mm_cmpgt_epi16(abs1, zeros);
    104 
    105   x = _mm_packs_epi16(x0, x1);
    106 
    107   x = _mm_shuffle_epi8(x, zig_zag);
    108 
    109   mask = _mm_movemask_epi8(x);
    110 
    111   eob = bsr(mask);
    112 
    113   *d->eob = 0xFF & eob;
    114 }
    115