Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "./vp8_rtcd.h"
     14 #include "vp8/encoder/block.h"
     15 
/* Inverse zig-zag scan positions, 1-based: inv_zig_zag[i] is the position
 * (1..16) at which raster-order coefficient i appears in the zig-zag scan.
 * Being 1-based (never 0) lets the quantizer AND this table with a
 * nonzero-coefficient mask and take the vector maximum to obtain the
 * end-of-block index directly. */
static const uint16_t inv_zig_zag[16] = { 1, 2, 6,  7,  3,  5,  8,  13,
                                          4, 9, 12, 14, 10, 11, 15, 16 };
     18 
     19 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
     20   const int16x8_t one_q = vdupq_n_s16(-1), z0 = vld1q_s16(b->coeff),
     21                   z1 = vld1q_s16(b->coeff + 8), round0 = vld1q_s16(b->round),
     22                   round1 = vld1q_s16(b->round + 8),
     23                   quant0 = vld1q_s16(b->quant_fast),
     24                   quant1 = vld1q_s16(b->quant_fast + 8),
     25                   dequant0 = vld1q_s16(d->dequant),
     26                   dequant1 = vld1q_s16(d->dequant + 8);
     27   const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
     28                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
     29   int16x8_t x0, x1, sz0, sz1, y0, y1;
     30   uint16x8_t eob0, eob1;
     31 #ifndef __aarch64__
     32   uint16x4_t eob_d16;
     33   uint32x2_t eob_d32;
     34   uint32x4_t eob_q32;
     35 #endif  // __arch64__
     36 
     37   /* sign of z: z >> 15 */
     38   sz0 = vshrq_n_s16(z0, 15);
     39   sz1 = vshrq_n_s16(z1, 15);
     40 
     41   /* x = abs(z) */
     42   x0 = vabsq_s16(z0);
     43   x1 = vabsq_s16(z1);
     44 
     45   /* x += round */
     46   x0 = vaddq_s16(x0, round0);
     47   x1 = vaddq_s16(x1, round1);
     48 
     49   /* y = 2 * (x * quant) >> 16 */
     50   y0 = vqdmulhq_s16(x0, quant0);
     51   y1 = vqdmulhq_s16(x1, quant1);
     52 
     53   /* Compensate for doubling in vqdmulhq */
     54   y0 = vshrq_n_s16(y0, 1);
     55   y1 = vshrq_n_s16(y1, 1);
     56 
     57   /* Restore sign bit */
     58   y0 = veorq_s16(y0, sz0);
     59   y1 = veorq_s16(y1, sz1);
     60   x0 = vsubq_s16(y0, sz0);
     61   x1 = vsubq_s16(y1, sz1);
     62 
     63   /* find non-zero elements */
     64   eob0 = vtstq_s16(x0, one_q);
     65   eob1 = vtstq_s16(x1, one_q);
     66 
     67   /* mask zig zag */
     68   eob0 = vandq_u16(eob0, zig_zag0);
     69   eob1 = vandq_u16(eob1, zig_zag1);
     70 
     71   /* select the largest value */
     72   eob0 = vmaxq_u16(eob0, eob1);
     73 #ifdef __aarch64__
     74   *d->eob = (int8_t)vmaxvq_u16(eob0);
     75 #else
     76   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
     77   eob_q32 = vmovl_u16(eob_d16);
     78   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
     79   eob_d32 = vpmax_u32(eob_d32, eob_d32);
     80 
     81   vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
     82 #endif  // __aarch64__
     83 
     84   /* qcoeff = x */
     85   vst1q_s16(d->qcoeff, x0);
     86   vst1q_s16(d->qcoeff + 8, x1);
     87 
     88   /* dqcoeff = x * dequant */
     89   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
     90   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
     91 }
     92