Home | History | Annotate | Download | only in arm
      1 /*
      2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 
     13 #include "./vpx_dsp_rtcd.h"
     14 #include "./vpx_config.h"
     15 #include "vpx_dsp/arm/mem_neon.h"
     16 #include "vpx_dsp/arm/sum_neon.h"
     17 
     18 static INLINE tran_low_t get_lane(const int32x2_t a) {
     19 #if CONFIG_VP9_HIGHBITDEPTH
     20   return vget_lane_s32(a, 0);
     21 #else
     22   return vget_lane_s16(vreinterpret_s16_s32(a), 0);
     23 #endif  // CONFIG_VP9_HIGHBITDETPH
     24 }
     25 
     26 void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
     27   int16x4_t a0, a1, a2, a3;
     28   int16x8_t b0, b1;
     29   int16x8_t c;
     30   int32x2_t d;
     31 
     32   a0 = vld1_s16(input);
     33   input += stride;
     34   a1 = vld1_s16(input);
     35   input += stride;
     36   a2 = vld1_s16(input);
     37   input += stride;
     38   a3 = vld1_s16(input);
     39 
     40   b0 = vcombine_s16(a0, a1);
     41   b1 = vcombine_s16(a2, a3);
     42 
     43   c = vaddq_s16(b0, b1);
     44 
     45   d = horizontal_add_int16x8(c);
     46 
     47   output[0] = get_lane(vshl_n_s32(d, 1));
     48   output[1] = 0;
     49 }
     50 
     51 void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
     52   int r;
     53   int16x8_t sum = vld1q_s16(&input[0]);
     54 
     55   for (r = 1; r < 8; ++r) {
     56     const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
     57     sum = vaddq_s16(sum, input_00);
     58   }
     59 
     60   output[0] = get_lane(horizontal_add_int16x8(sum));
     61   output[1] = 0;
     62 }
     63 
     64 void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
     65                           int stride) {
     66   int r;
     67   int16x8_t left = vld1q_s16(input);
     68   int16x8_t right = vld1q_s16(input + 8);
     69   int32x2_t sum;
     70   input += stride;
     71 
     72   for (r = 1; r < 16; ++r) {
     73     const int16x8_t a = vld1q_s16(input);
     74     const int16x8_t b = vld1q_s16(input + 8);
     75     input += stride;
     76     left = vaddq_s16(left, a);
     77     right = vaddq_s16(right, b);
     78   }
     79 
     80   sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));
     81 
     82   output[0] = get_lane(vshr_n_s32(sum, 1));
     83   output[1] = 0;
     84 }
     85 
     86 void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
     87                           int stride) {
     88   int r;
     89   int16x8_t a0 = vld1q_s16(input);
     90   int16x8_t a1 = vld1q_s16(input + 8);
     91   int16x8_t a2 = vld1q_s16(input + 16);
     92   int16x8_t a3 = vld1q_s16(input + 24);
     93   int32x2_t sum;
     94   input += stride;
     95 
     96   for (r = 1; r < 32; ++r) {
     97     const int16x8_t b0 = vld1q_s16(input);
     98     const int16x8_t b1 = vld1q_s16(input + 8);
     99     const int16x8_t b2 = vld1q_s16(input + 16);
    100     const int16x8_t b3 = vld1q_s16(input + 24);
    101     input += stride;
    102     a0 = vaddq_s16(a0, b0);
    103     a1 = vaddq_s16(a1, b1);
    104     a2 = vaddq_s16(a2, b2);
    105     a3 = vaddq_s16(a3, b3);
    106   }
    107 
    108   sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
    109   sum = vadd_s32(sum, horizontal_add_int16x8(a2));
    110   sum = vadd_s32(sum, horizontal_add_int16x8(a3));
    111   output[0] = get_lane(vshr_n_s32(sum, 3));
    112   output[1] = 0;
    113 }
    114