1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 #include "vp8/encoder/block.h" 13 14 static const uint16_t inv_zig_zag[16] = { 1, 2, 6, 7, 3, 5, 8, 13, 15 4, 9, 12, 14, 10, 11, 15, 16 }; 16 17 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { 18 const int16x8_t one_q = vdupq_n_s16(-1), z0 = vld1q_s16(b->coeff), 19 z1 = vld1q_s16(b->coeff + 8), round0 = vld1q_s16(b->round), 20 round1 = vld1q_s16(b->round + 8), 21 quant0 = vld1q_s16(b->quant_fast), 22 quant1 = vld1q_s16(b->quant_fast + 8), 23 dequant0 = vld1q_s16(d->dequant), 24 dequant1 = vld1q_s16(d->dequant + 8); 25 const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag), 26 zig_zag1 = vld1q_u16(inv_zig_zag + 8); 27 int16x8_t x0, x1, sz0, sz1, y0, y1; 28 uint16x8_t eob0, eob1; 29 uint16x4_t eob_d16; 30 uint32x2_t eob_d32; 31 uint32x4_t eob_q32; 32 33 /* sign of z: z >> 15 */ 34 sz0 = vshrq_n_s16(z0, 15); 35 sz1 = vshrq_n_s16(z1, 15); 36 37 /* x = abs(z) */ 38 x0 = vabsq_s16(z0); 39 x1 = vabsq_s16(z1); 40 41 /* x += round */ 42 x0 = vaddq_s16(x0, round0); 43 x1 = vaddq_s16(x1, round1); 44 45 /* y = 2 * (x * quant) >> 16 */ 46 y0 = vqdmulhq_s16(x0, quant0); 47 y1 = vqdmulhq_s16(x1, quant1); 48 49 /* Compensate for doubling in vqdmulhq */ 50 y0 = vshrq_n_s16(y0, 1); 51 y1 = vshrq_n_s16(y1, 1); 52 53 /* Restore sign bit */ 54 y0 = veorq_s16(y0, sz0); 55 y1 = veorq_s16(y1, sz1); 56 x0 = vsubq_s16(y0, sz0); 57 x1 = vsubq_s16(y1, sz1); 58 59 /* find non-zero elements */ 60 eob0 = vtstq_s16(x0, one_q); 61 eob1 = vtstq_s16(x1, one_q); 62 63 /* mask zig zag */ 64 eob0 = vandq_u16(eob0, zig_zag0); 65 eob1 = vandq_u16(eob1, zig_zag1); 66 67 /* select the largest value */ 68 eob0 = vmaxq_u16(eob0, eob1); 69 eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); 70 eob_q32 = vmovl_u16(eob_d16); 71 eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32)); 72 eob_d32 = vpmax_u32(eob_d32, eob_d32); 73 74 /* qcoeff = x */ 75 vst1q_s16(d->qcoeff, x0); 76 vst1q_s16(d->qcoeff + 8, x1); 77 78 /* dqcoeff = x * dequant */ 79 vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0)); 80 vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1)); 81 82 vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); 83 } 84