/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <tmmintrin.h> /* SSSE3 */

#include "vp8/encoder/block.h"

/* bit scan reverse (bsr): returns 1 + the index of the highest set bit,
 * or 0 when the mask is empty. */
#if defined(_MSC_VER)
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
static int bsr(int mask) {
  unsigned long eob;
  _BitScanReverse(&eob, mask);
  eob++;
  if (mask == 0)
    eob = 0;
  return eob;
}
#else
static int bsr(int mask) {
  int eob;
#if defined(__GNUC__) && __GNUC__
  __asm__ __volatile__("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
  asm volatile("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
#endif
  eob++;
  if (mask == 0)
    eob = 0;
  return eob;
}
#endif

void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
  int eob, mask;

  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

  __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;

  DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
  };
  __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);

  /* sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z) */
  x0 = _mm_abs_epi16(z0);
  x1 = _mm_abs_epi16(z1);

  /* x += round */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  /* y = (x * quant) >> 16 */
  y0 = _mm_mulhi_epi16(x0, quant_fast0);
  y1 = _mm_mulhi_epi16(x1, quant_fast1);

  /* The assembly version saves the unsigned y for the EOB scan. Keep that copy
   * here as well: restoring the sign does not change which lanes are zero, and
   * multiplying a zero coefficient by dequant still yields zero. */
  abs0 = y0;
  abs1 = y1;

  /* Restore the sign bit: x = (y ^ sz) - sz. */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  x0 = _mm_sub_epi16(y0, sz0);
  x1 = _mm_sub_epi16(y1, sz1);

  /* qcoeff = x */
  _mm_store_si128((__m128i *)(d->qcoeff), x0);
  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

  /* x * dequant */
  x0 = _mm_mullo_epi16(x0, dequant0);
  x1 = _mm_mullo_epi16(x1, dequant1);

  /* dqcoeff = x * dequant */
  _mm_store_si128((__m128i *)(d->dqcoeff), x0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1);

  zeros = _mm_setzero_si128();

  /* Per-lane nonzero flags for the quantized magnitudes. */
  x0 = _mm_cmpgt_epi16(abs0, zeros);
  x1 = _mm_cmpgt_epi16(abs1, zeros);

  /* Pack the 16-bit flags to bytes and reorder them into zig-zag scan order,
   * then collect one bit per coefficient; the highest set bit marks the last
   * nonzero coefficient. */
  x = _mm_packs_epi16(x0, x1);

  x = _mm_shuffle_epi8(x, zig_zag);

  mask = _mm_movemask_epi8(x);

  eob = bsr(mask);

  *d->eob = 0xFF & eob;
}
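
/*
 * For reference only: a minimal scalar sketch of the same fast-quantize step,
 * kept disabled. It is not part of libvpx; the helper name
 * vp8_fast_quantize_b_scalar_ref is hypothetical, and the sketch assumes only
 * the BLOCK/BLOCKD fields already used above (coeff, round, quant_fast,
 * qcoeff, dqcoeff, dequant, eob). It ignores the 16-bit wraparound behavior
 * of the vector arithmetic.
 */
#if 0
static void vp8_fast_quantize_b_scalar_ref(BLOCK *b, BLOCKD *d) {
  /* VP8 zig-zag scan order for the 4x4 block (same table as above). */
  static const int zig_zag[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                   9, 12, 13, 10, 7, 11, 14, 15 };
  int i, last_nonzero = -1;

  for (i = 0; i < 16; ++i) {
    const int rc = zig_zag[i];             /* raster index of scan position i */
    const int z = b->coeff[rc];
    const int x = (z < 0 ? -z : z) + b->round[rc];
    int y = (x * b->quant_fast[rc]) >> 16; /* matches _mm_mulhi_epi16 above */

    if (z < 0) y = -y;                     /* restore the sign */
    d->qcoeff[rc] = y;
    d->dqcoeff[rc] = y * d->dequant[rc];
    if (y) last_nonzero = i;               /* last nonzero in scan order */
  }
  *d->eob = 0xFF & (last_nonzero + 1);
}
#endif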