/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

// Extract lane 0 of a 32-bit pair as a tran_low_t. With
// CONFIG_VP9_HIGHBITDEPTH the full 32-bit lane is returned; otherwise the
// value is reinterpreted and truncated to the low 16 bits, matching the
// narrower tran_low_t used in the non-high-bitdepth build.
static INLINE tran_low_t get_lane(const int32x2_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  return vget_lane_s32(a, 0);
#else
  return vget_lane_s16(vreinterpret_s16_s32(a), 0);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

// Partial 4x4 forward transform: compute only the DC coefficient (the sum of
// all 16 input samples, shifted left by 1 to match the scaling of the full
// transform's DC term). output[1] is zeroed as an end-of-block marker.
void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  int16x4_t a0, a1, a2, a3;
  int16x8_t b0, b1;
  int16x8_t c;
  int32x2_t d;

  a0 = vld1_s16(input);
  input += stride;
  a1 = vld1_s16(input);
  input += stride;
  a2 = vld1_s16(input);
  input += stride;
  a3 = vld1_s16(input);

  b0 = vcombine_s16(a0, a1);
  b1 = vcombine_s16(a2, a3);

  c = vaddq_s16(b0, b1);

  d = horizontal_add_int16x8(c);

  output[0] = get_lane(vshl_n_s32(d, 1));
  output[1] = 0;
}

// Partial 8x8 forward transform: DC coefficient only. The sum of all 64
// samples is used unshifted (the full 8x8 transform's DC scaling is 1).
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  int r;
  int16x8_t sum = vld1q_s16(&input[0]);

  for (r = 1; r < 8; ++r) {
    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
    sum = vaddq_s16(sum, input_00);
  }

  output[0] = get_lane(horizontal_add_int16x8(sum));
  output[1] = 0;
}

// Partial 16x16 forward transform: DC coefficient only. Each row is
// accumulated in two 8-lane halves to avoid 16-bit overflow in a single
// accumulator; the combined sum is shifted right by 1 to match the full
// transform's DC scaling.
void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
                          int stride) {
  int r;
  int16x8_t left = vld1q_s16(input);
  int16x8_t right = vld1q_s16(input + 8);
  int32x2_t sum;
  input += stride;

  for (r = 1; r < 16; ++r) {
    const int16x8_t a = vld1q_s16(input);
    const int16x8_t b = vld1q_s16(input + 8);
    input += stride;
    left = vaddq_s16(left, a);
    right = vaddq_s16(right, b);
  }

  sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));

  output[0] = get_lane(vshr_n_s32(sum, 1));
  output[1] = 0;
}

// Partial 32x32 forward transform: DC coefficient only. Each row is
// accumulated in four 8-lane quarters; the combined sum is shifted right by
// 3 to match the full transform's DC scaling.
void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
                          int stride) {
  int r;
  int16x8_t a0 = vld1q_s16(input);
  int16x8_t a1 = vld1q_s16(input + 8);
  int16x8_t a2 = vld1q_s16(input + 16);
  int16x8_t a3 = vld1q_s16(input + 24);
  int32x2_t sum;
  input += stride;

  for (r = 1; r < 32; ++r) {
    const int16x8_t b0 = vld1q_s16(input);
    const int16x8_t b1 = vld1q_s16(input + 8);
    const int16x8_t b2 = vld1q_s16(input + 16);
    const int16x8_t b3 = vld1q_s16(input + 24);
    input += stride;
    a0 = vaddq_s16(a0, b0);
    a1 = vaddq_s16(a1, b1);
    a2 = vaddq_s16(a2, b2);
    a3 = vaddq_s16(a3, b3);
  }

  sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
  sum = vadd_s32(sum, horizontal_add_int16x8(a2));
  sum = vadd_s32(sum, horizontal_add_int16x8(a3));
  output[0] = get_lane(vshr_n_s32(sum, 3));
  output[1] = 0;
}