/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/x86.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#include <mmintrin.h>  /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */

#define SELECT_EOB(i, z)                        \
    do {                                        \
        short boost = *zbin_boost_ptr;          \
        int cmp = (x[z] < boost) | (y[z] == 0); \
        zbin_boost_ptr++;                       \
        if (cmp)                                \
            goto select_eob_end_##i;            \
        qcoeff_ptr[z] = y[z];                   \
        eob = i;                                \
        zbin_boost_ptr = b->zrun_zbin_boost;    \
        select_eob_end_##i:;                    \
    } while (0)

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    char eob = 0;
    short *zbin_boost_ptr = b->zrun_zbin_boost;
    short *qcoeff_ptr = d->qcoeff;
    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
    DECLARE_ALIGNED_ARRAY(16, short, y, 16);

    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

    vpx_memset(qcoeff_ptr, 0, 32);

    /* Duplicate to all lanes. */
    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

    /* Sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* zbin[] + zbin_extra */
    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

    /* In C, x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
     * the equation because boost is the only value which can change:
     * x - (zbin[] + extra) >= boost */
    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

    /* All the remaining calculations are valid whether they are done now with
     * SIMD or later inside the loop one at a time. */
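    /* For reference, a rough scalar sketch of those remaining steps (not the
     * exact C reference implementation; quant_shift here is the pre-converted
     * 1 << (16 - shift) factor described below):
     *
     *     x = abs(z) + round;
     *     y = ((((x * quant) >> 16) + x) * quant_shift) >> 16;
     *     y = (y ^ sz) - sz;        restore the sign of z
     *
     * The zero-bin test itself happens per coefficient in SELECT_EOB, which
     * compares the x[] values stored above against the running zbin boost. */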
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    y0 = _mm_mulhi_epi16(x0, quant0);
    y1 = _mm_mulhi_epi16(x1, quant1);

    y0 = _mm_add_epi16(y0, x0);
    y1 = _mm_add_epi16(y1, x1);

    /* Instead of shifting each value independently we convert the scaling
     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
    y0 = _mm_mulhi_epi16(y0, quant_shift0);
    y1 = _mm_mulhi_epi16(y1, quant_shift1);

    /* Restore the sign: (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    y0 = _mm_sub_epi16(y0, sz0);
    y1 = _mm_sub_epi16(y1, sz1);

    _mm_store_si128((__m128i *)(y), y0);
    _mm_store_si128((__m128i *)(y + 8), y1);

    zbin_boost_ptr = b->zrun_zbin_boost;

    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
    SELECT_EOB(1, 0);
    SELECT_EOB(2, 1);
    SELECT_EOB(3, 4);
    SELECT_EOB(4, 8);
    SELECT_EOB(5, 5);
    SELECT_EOB(6, 2);
    SELECT_EOB(7, 3);
    SELECT_EOB(8, 6);
    SELECT_EOB(9, 9);
    SELECT_EOB(10, 12);
    SELECT_EOB(11, 13);
    SELECT_EOB(12, 10);
    SELECT_EOB(13, 7);
    SELECT_EOB(14, 11);
    SELECT_EOB(15, 14);
    SELECT_EOB(16, 15);

    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

    /* dqcoeff = qcoeff * dequant */
    y0 = _mm_mullo_epi16(y0, dequant0);
    y1 = _mm_mullo_epi16(y1, dequant1);

    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

    *d->eob = eob;
}

void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
    __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
    __m128i inv_zig_zag0 =
        _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
    __m128i inv_zig_zag1 =
        _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

    __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

    /* sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* x += round */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    /* y = (x * quant) >> 16 */
    y0 = _mm_mulhi_epi16(x0, quant_fast0);
    y1 = _mm_mulhi_epi16(x1, quant_fast1);

    /* Restore the sign of z: x = (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    x0 = _mm_sub_epi16(y0, sz0);
    x1 = _mm_sub_epi16(y1, sz1);

    /* qcoeff = x */
    _mm_store_si128((__m128i *)(d->qcoeff), x0);
    _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

    /* x * dequant */
    xdq0 = _mm_mullo_epi16(x0, dequant0);
    xdq1 = _mm_mullo_epi16(x1, dequant1);

    /* dqcoeff = x * dequant */
    _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
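    /* From here the eob is computed entirely in SIMD: mark every nonzero
     * quantized coefficient, replace each mark with that coefficient's
     * 1-based position in zig-zag scan order (vp8_default_inv_zig_zag), and
     * reduce with max. The largest surviving position is the end of block. */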
    /* build a mask for the zig zag */
    zeros = _mm_setzero_si128();

    x0 = _mm_cmpeq_epi16(x0, zeros);
    x1 = _mm_cmpeq_epi16(x1, zeros);

    ones = _mm_cmpeq_epi16(zeros, zeros);

    x0 = _mm_xor_si128(x0, ones);
    x1 = _mm_xor_si128(x1, ones);

    x0 = _mm_and_si128(x0, inv_zig_zag0);
    x1 = _mm_and_si128(x1, inv_zig_zag1);

    x0 = _mm_max_epi16(x0, x1);

    /* now down to 8 */
    x1 = _mm_shuffle_epi32(x0, 0xE); /* 0b00001110 */

    x0 = _mm_max_epi16(x0, x1);

    /* only 4 left */
    x1 = _mm_shufflelo_epi16(x0, 0xE); /* 0b00001110 */

    x0 = _mm_max_epi16(x0, x1);

    /* okay, just 2! */
    x1 = _mm_shufflelo_epi16(x0, 0x1); /* 0b00000001 */

    x0 = _mm_max_epi16(x0, x1);

    *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}
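/* A note on the reduction above: _mm_shuffle_epi32(x, 0xE) moves the high
 * 64 bits of x into the low 64 bits, _mm_shufflelo_epi16(x, 0xE) copies
 * words 2 and 3 down to words 0 and 1, and _mm_shufflelo_epi16(x, 0x1)
 * copies word 1 down to word 0. The three interleaved max operations
 * therefore funnel the largest zig-zag position into word 0, which
 * _mm_cvtsi128_si32 extracts as the eob. */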