1 /* 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 /* 12 * filters_neon.c 13 * 14 * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for 15 * ARM Neon platform. 16 * 17 */ 18 19 #include <arm_neon.h> 20 #include <assert.h> 21 22 #include "codec.h" 23 24 // Autocorrelation function in fixed point. 25 // NOTE! Different from SPLIB-version in how it scales the signal. 26 int WebRtcIsacfix_AutocorrNeon( 27 WebRtc_Word32* __restrict r, 28 const WebRtc_Word16* __restrict x, 29 WebRtc_Word16 N, 30 WebRtc_Word16 order, 31 WebRtc_Word16* __restrict scale) { 32 33 // The 1st for loop assumed N % 4 == 0. 34 assert(N % 4 == 0); 35 36 int i = 0; 37 int zeros_low = 0; 38 int zeros_high = 0; 39 int16_t scaling = 0; 40 int32_t sum = 0; 41 42 // Step 1, calculate r[0] and how much scaling is needed. 43 44 int16x4_t reg16x4; 45 int64x1_t reg64x1a; 46 int64x1_t reg64x1b; 47 int32x4_t reg32x4; 48 int64x2_t reg64x2 = vdupq_n_s64(0); // zeros 49 50 // Loop over the samples and do: 51 // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]); 52 for (i = 0; i < N; i += 4) { 53 reg16x4 = vld1_s16(&x[i]); 54 reg32x4 = vmull_s16(reg16x4, reg16x4); 55 reg64x2 = vpadalq_s32(reg64x2, reg32x4); 56 } 57 reg64x1a = vget_low_s64(reg64x2); 58 reg64x1b = vget_high_s64(reg64x2); 59 reg64x1a = vadd_s64(reg64x1a, reg64x1b); 60 61 // Calculate the value of shifting (scaling). 62 __asm__ __volatile__( 63 "vmov %[z_l], %[z_h], %P[reg]\n\t" 64 "clz %[z_l], %[z_l]\n\t" 65 "clz %[z_h], %[z_h]\n\t" 66 :[z_l]"+r"(zeros_low), 67 [z_h]"+r"(zeros_high) 68 :[reg]"w"(reg64x1a) 69 ); 70 if (zeros_high != 32) { 71 scaling = (32 - zeros_high + 1); 72 } else if (zeros_low == 0) { 73 scaling = 1; 74 } 75 reg64x1b = -scaling; 76 reg64x1a = vshl_s64(reg64x1a, reg64x1b); 77 78 // Record the result. 79 r[0] = (int32_t)vget_lane_s64(reg64x1a, 0); 80 81 82 // Step 2, perform the actual correlation calculation. 83 84 /* Original C code (for the rest of the function): 85 for (i = 1; i < order + 1; i++) { 86 prod = 0; 87 for (j = 0; j < N - i; j++) { 88 prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]); 89 } 90 sum = (int32_t)(prod >> scaling); 91 r[i] = sum; 92 } 93 */ 94 95 for (i = 1; i < order + 1; i++) { 96 int32_t prod_lower = 0; 97 int32_t prod_upper = 0; 98 const int16_t* ptr0 = &x[0]; 99 const int16_t* ptr1 = &x[i]; 100 int32_t tmp = 0; 101 102 // Initialize the sum (q9) to zero. 103 __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9"); 104 105 // Calculate the major block of the samples (a multiple of 8). 106 for (; ptr0 < &x[N - i - 7];) { 107 __asm__ __volatile__( 108 "vld1.16 {d20, d21}, [%[ptr0]]!\n\t" 109 "vld1.16 {d22, d23}, [%[ptr1]]!\n\t" 110 "vmull.s16 q12, d20, d22\n\t" 111 "vmull.s16 q13, d21, d23\n\t" 112 "vpadal.s32 q9, q12\n\t" 113 "vpadal.s32 q9, q13\n\t" 114 115 // Specify constraints. 116 :[ptr0]"+r"(ptr0), 117 [ptr1]"+r"(ptr1) 118 : 119 :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27" 120 ); 121 } 122 123 // Calculate the rest of the samples. 124 for (; ptr0 < &x[N - i]; ptr0++, ptr1++) { 125 __asm__ __volatile__( 126 "smulbb %[tmp], %[ptr0], %[ptr1]\n\t" 127 "adds %[prod_lower], %[prod_lower], %[tmp]\n\t" 128 "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t" 129 130 // Specify constraints. 131 :[prod_lower]"+r"(prod_lower), 132 [prod_upper]"+r"(prod_upper), 133 [tmp]"+r"(tmp) 134 :[ptr0]"r"(*ptr0), 135 [ptr1]"r"(*ptr1) 136 ); 137 } 138 139 // Sum the results up, and do shift. 140 __asm__ __volatile__( 141 "vadd.i64 d18, d19\n\t" 142 "vmov.32 d17[0], %[prod_lower]\n\t" 143 "vmov.32 d17[1], %[prod_upper]\n\t" 144 "vadd.i64 d17, d18\n\t" 145 "mov %[tmp], %[scaling], asr #31\n\t" 146 "vmov.32 d16, %[scaling], %[tmp]\n\t" 147 "vshl.s64 d17, d16\n\t" 148 "vmov.32 %[sum], d17[0]\n\t" 149 150 // Specify constraints. 151 :[sum]"=r"(sum), 152 [tmp]"+r"(tmp) 153 :[prod_upper]"r"(prod_upper), 154 [prod_lower]"r"(prod_lower), 155 [scaling]"r"(-scaling) 156 :"d16", "d17", "d18", "d19" 157 ); 158 159 // Record the result. 160 r[i] = sum; 161 } 162 163 // Record the result. 164 *scale = scaling; 165 166 return(order + 1); 167 } 168