/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"

/* Apply an 8-point 1-D Hadamard transform across the eight input vectors,
 * one independent transform per lane: three stages of butterflies using
 * only adds and subtracts, no multiplies. */
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                 int16x8_t *a6, int16x8_t *a7) {
  const int16x8_t b0 = vaddq_s16(*a0, *a1);
  const int16x8_t b1 = vsubq_s16(*a0, *a1);
  const int16x8_t b2 = vaddq_s16(*a2, *a3);
  const int16x8_t b3 = vsubq_s16(*a2, *a3);
  const int16x8_t b4 = vaddq_s16(*a4, *a5);
  const int16x8_t b5 = vsubq_s16(*a4, *a5);
  const int16x8_t b6 = vaddq_s16(*a6, *a7);
  const int16x8_t b7 = vsubq_s16(*a6, *a7);

  const int16x8_t c0 = vaddq_s16(b0, b2);
  const int16x8_t c1 = vaddq_s16(b1, b3);
  const int16x8_t c2 = vsubq_s16(b0, b2);
  const int16x8_t c3 = vsubq_s16(b1, b3);
  const int16x8_t c4 = vaddq_s16(b4, b6);
  const int16x8_t c5 = vaddq_s16(b5, b7);
  const int16x8_t c6 = vsubq_s16(b4, b6);
  const int16x8_t c7 = vsubq_s16(b5, b7);

  *a0 = vaddq_s16(c0, c4);
  *a1 = vsubq_s16(c2, c6);
  *a2 = vsubq_s16(c0, c4);
  *a3 = vaddq_s16(c2, c6);
  *a4 = vaddq_s16(c3, c7);
  *a5 = vsubq_s16(c3, c7);
  *a6 = vsubq_s16(c1, c5);
  *a7 = vaddq_s16(c1, c5);
}

void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
                           tran_low_t *coeff) {
  int16x8_t a0 = vld1q_s16(src_diff);
  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);

  /* First pass: each vector holds one row, so this transforms the columns. */
  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  /* Second pass: transform the rows of the original block. */
  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  // Skip the second transpose because it is not required.

  store_s16q_to_tran_low(coeff + 0, a0);
  store_s16q_to_tran_low(coeff + 8, a1);
  store_s16q_to_tran_low(coeff + 16, a2);
  store_s16q_to_tran_low(coeff + 24, a3);
  store_s16q_to_tran_low(coeff + 32, a4);
  store_s16q_to_tran_low(coeff + 40, a5);
  store_s16q_to_tran_low(coeff + 48, a6);
  store_s16q_to_tran_low(coeff + 56, a7);
}
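
/* For illustration only: a minimal scalar sketch of the same two-pass 8x8
 * Hadamard, useful as a cross-check against the NEON path. The helper names
 * below are hypothetical and not part of the vpx_dsp API; the block is kept
 * under #if 0 so it does not affect the build. Like the NEON version, the
 * second pass stores its result transposed instead of transposing back. */
#if 0
static void hadamard8_ref(int16_t a[8]) {
  /* Same three butterfly stages as hadamard8x8_one_pass, on one lane. */
  const int16_t b0 = a[0] + a[1], b1 = a[0] - a[1];
  const int16_t b2 = a[2] + a[3], b3 = a[2] - a[3];
  const int16_t b4 = a[4] + a[5], b5 = a[4] - a[5];
  const int16_t b6 = a[6] + a[7], b7 = a[6] - a[7];
  const int16_t c0 = b0 + b2, c1 = b1 + b3, c2 = b0 - b2, c3 = b1 - b3;
  const int16_t c4 = b4 + b6, c5 = b5 + b7, c6 = b4 - b6, c7 = b5 - b7;
  a[0] = c0 + c4; a[1] = c2 - c6; a[2] = c0 - c4; a[3] = c2 + c6;
  a[4] = c3 + c7; a[5] = c3 - c7; a[6] = c1 - c5; a[7] = c1 + c5;
}

static void hadamard_8x8_scalar_ref(const int16_t *src_diff, int src_stride,
                                    tran_low_t *coeff) {
  int16_t tmp[8][8];
  int i, j;
  /* Vertical pass: butterfly down each column of the residual block. */
  for (j = 0; j < 8; ++j) {
    int16_t col[8];
    for (i = 0; i < 8; ++i) col[i] = src_diff[i * src_stride + j];
    hadamard8_ref(col);
    for (i = 0; i < 8; ++i) tmp[i][j] = col[i];
  }
  /* Horizontal pass; store column-wise to mirror the skipped transpose. */
  for (i = 0; i < 8; ++i) {
    hadamard8_ref(tmp[i]);
    for (j = 0; j < 8; ++j) coeff[j * 8 + i] = (tran_low_t)tmp[i][j];
  }
}
#endif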

void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
                             tran_low_t *coeff) {
  int i;

  /* Rearrange 16x16 to 8x32 and remove stride.
   * Top left first. */
  vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
  /* Top right. */
  vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
  /* Bottom left. */
  vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
                        coeff + 128);
  /* Bottom right. */
  vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
                        coeff + 192);

  /* Combine the four 8x8 transforms with one more butterfly stage. The
   * halving adds/subtracts keep the 16x16 results within int16_t range. */
  for (i = 0; i < 64; i += 8) {
    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);

    const int16x8_t b0 = vhaddq_s16(a0, a1);
    const int16x8_t b1 = vhsubq_s16(a0, a1);
    const int16x8_t b2 = vhaddq_s16(a2, a3);
    const int16x8_t b3 = vhsubq_s16(a2, a3);

    const int16x8_t c0 = vaddq_s16(b0, b2);
    const int16x8_t c1 = vaddq_s16(b1, b3);
    const int16x8_t c2 = vsubq_s16(b0, b2);
    const int16x8_t c3 = vsubq_s16(b1, b3);

    store_s16q_to_tran_low(coeff + 0, c0);
    store_s16q_to_tran_low(coeff + 64, c1);
    store_s16q_to_tran_low(coeff + 128, c2);
    store_s16q_to_tran_low(coeff + 192, c3);

    coeff += 8;
  }
}
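
/* A minimal usage sketch (hypothetical, not part of the library): these
 * transforms are typically consumed by SATD-style rate estimates that sum
 * the absolute values of the output coefficients. Note the 16x16 output is
 * scaled down by 2 relative to a full 16x16 Hadamard because of the halving
 * butterflies above. Kept under #if 0 so it does not affect the build. */
#if 0
static int example_hadamard_satd_16x16(const int16_t *src_diff,
                                       int src_stride) {
  tran_low_t coeff[256];
  int i;
  int satd = 0;
  vpx_hadamard_16x16_neon(src_diff, src_stride, coeff);
  /* Sum of absolute transformed differences. */
  for (i = 0; i < 256; ++i) satd += coeff[i] > 0 ? coeff[i] : -coeff[i];
  return satd;
}
#endif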