1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 #include <assert.h> 13 #include <stdlib.h> 14 15 #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 16 17 // Maximum absolute value of word16 vector. C version for generic platforms. 18 int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) { 19 int absolute = 0, maximum = 0; 20 21 assert(length > 0); 22 23 const int16_t* p_start = vector; 24 size_t rest = length & 7; 25 const int16_t* p_end = vector + length - rest; 26 27 int16x8_t v; 28 uint16x8_t max_qv; 29 max_qv = vdupq_n_u16(0); 30 31 while (p_start < p_end) { 32 v = vld1q_s16(p_start); 33 // Note vabs doesn't change the value of -32768. 34 v = vabsq_s16(v); 35 // Use u16 so we don't lose the value -32768. 36 max_qv = vmaxq_u16(max_qv, vreinterpretq_u16_s16(v)); 37 p_start += 8; 38 } 39 40 #ifdef WEBRTC_ARCH_ARM64 41 maximum = (int)vmaxvq_u16(max_qv); 42 #else 43 uint16x4_t max_dv; 44 max_dv = vmax_u16(vget_low_u16(max_qv), vget_high_u16(max_qv)); 45 max_dv = vpmax_u16(max_dv, max_dv); 46 max_dv = vpmax_u16(max_dv, max_dv); 47 48 maximum = (int)vget_lane_u16(max_dv, 0); 49 #endif 50 51 p_end = vector + length; 52 while (p_start < p_end) { 53 absolute = abs((int)(*p_start)); 54 55 if (absolute > maximum) { 56 maximum = absolute; 57 } 58 p_start++; 59 } 60 61 // Guard the case for abs(-32768). 62 if (maximum > WEBRTC_SPL_WORD16_MAX) { 63 maximum = WEBRTC_SPL_WORD16_MAX; 64 } 65 66 return (int16_t)maximum; 67 } 68 69 // Maximum absolute value of word32 vector. NEON intrinsics version for 70 // ARM 32-bit/64-bit platforms. 71 int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) { 72 // Use uint32_t for the local variables, to accommodate the return value 73 // of abs(0x80000000), which is 0x80000000. 74 75 uint32_t absolute = 0, maximum = 0; 76 size_t i = 0; 77 size_t residual = length & 0x7; 78 79 assert(length > 0); 80 81 const int32_t* p_start = vector; 82 uint32x4_t max32x4_0 = vdupq_n_u32(0); 83 uint32x4_t max32x4_1 = vdupq_n_u32(0); 84 85 // First part, unroll the loop 8 times. 86 for (i = 0; i < length - residual; i += 8) { 87 int32x4_t in32x4_0 = vld1q_s32(p_start); 88 p_start += 4; 89 int32x4_t in32x4_1 = vld1q_s32(p_start); 90 p_start += 4; 91 in32x4_0 = vabsq_s32(in32x4_0); 92 in32x4_1 = vabsq_s32(in32x4_1); 93 // vabs doesn't change the value of 0x80000000. 94 // Use u32 so we don't lose the value 0x80000000. 95 max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0)); 96 max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1)); 97 } 98 99 uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1); 100 #if defined(WEBRTC_ARCH_ARM64) 101 maximum = vmaxvq_u32(max32x4); 102 #else 103 uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4)); 104 max32x2 = vpmax_u32(max32x2, max32x2); 105 106 maximum = vget_lane_u32(max32x2, 0); 107 #endif 108 109 // Second part, do the remaining iterations (if any). 110 for (i = residual; i > 0; i--) { 111 absolute = abs((int)(*p_start)); 112 if (absolute > maximum) { 113 maximum = absolute; 114 } 115 p_start++; 116 } 117 118 // Guard against the case for 0x80000000. 119 maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX); 120 121 return (int32_t)maximum; 122 } 123 124 // Maximum value of word16 vector. NEON intrinsics version for 125 // ARM 32-bit/64-bit platforms. 126 int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) { 127 int16_t maximum = WEBRTC_SPL_WORD16_MIN; 128 size_t i = 0; 129 size_t residual = length & 0x7; 130 131 assert(length > 0); 132 133 const int16_t* p_start = vector; 134 int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN); 135 136 // First part, unroll the loop 8 times. 137 for (i = 0; i < length - residual; i += 8) { 138 int16x8_t in16x8 = vld1q_s16(p_start); 139 max16x8 = vmaxq_s16(max16x8, in16x8); 140 p_start += 8; 141 } 142 143 #if defined(WEBRTC_ARCH_ARM64) 144 maximum = vmaxvq_s16(max16x8); 145 #else 146 int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8)); 147 max16x4 = vpmax_s16(max16x4, max16x4); 148 max16x4 = vpmax_s16(max16x4, max16x4); 149 150 maximum = vget_lane_s16(max16x4, 0); 151 #endif 152 153 // Second part, do the remaining iterations (if any). 154 for (i = residual; i > 0; i--) { 155 if (*p_start > maximum) 156 maximum = *p_start; 157 p_start++; 158 } 159 return maximum; 160 } 161 162 // Maximum value of word32 vector. NEON intrinsics version for 163 // ARM 32-bit/64-bit platforms. 164 int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) { 165 int32_t maximum = WEBRTC_SPL_WORD32_MIN; 166 size_t i = 0; 167 size_t residual = length & 0x7; 168 169 assert(length > 0); 170 171 const int32_t* p_start = vector; 172 int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN); 173 int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN); 174 175 // First part, unroll the loop 8 times. 176 for (i = 0; i < length - residual; i += 8) { 177 int32x4_t in32x4_0 = vld1q_s32(p_start); 178 p_start += 4; 179 int32x4_t in32x4_1 = vld1q_s32(p_start); 180 p_start += 4; 181 max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0); 182 max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1); 183 } 184 185 int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1); 186 #if defined(WEBRTC_ARCH_ARM64) 187 maximum = vmaxvq_s32(max32x4); 188 #else 189 int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4)); 190 max32x2 = vpmax_s32(max32x2, max32x2); 191 192 maximum = vget_lane_s32(max32x2, 0); 193 #endif 194 195 // Second part, do the remaining iterations (if any). 196 for (i = residual; i > 0; i--) { 197 if (*p_start > maximum) 198 maximum = *p_start; 199 p_start++; 200 } 201 return maximum; 202 } 203 204 // Minimum value of word16 vector. NEON intrinsics version for 205 // ARM 32-bit/64-bit platforms. 206 int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) { 207 int16_t minimum = WEBRTC_SPL_WORD16_MAX; 208 size_t i = 0; 209 size_t residual = length & 0x7; 210 211 assert(length > 0); 212 213 const int16_t* p_start = vector; 214 int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX); 215 216 // First part, unroll the loop 8 times. 217 for (i = 0; i < length - residual; i += 8) { 218 int16x8_t in16x8 = vld1q_s16(p_start); 219 min16x8 = vminq_s16(min16x8, in16x8); 220 p_start += 8; 221 } 222 223 #if defined(WEBRTC_ARCH_ARM64) 224 minimum = vminvq_s16(min16x8); 225 #else 226 int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8)); 227 min16x4 = vpmin_s16(min16x4, min16x4); 228 min16x4 = vpmin_s16(min16x4, min16x4); 229 230 minimum = vget_lane_s16(min16x4, 0); 231 #endif 232 233 // Second part, do the remaining iterations (if any). 234 for (i = residual; i > 0; i--) { 235 if (*p_start < minimum) 236 minimum = *p_start; 237 p_start++; 238 } 239 return minimum; 240 } 241 242 // Minimum value of word32 vector. NEON intrinsics version for 243 // ARM 32-bit/64-bit platforms. 244 int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) { 245 int32_t minimum = WEBRTC_SPL_WORD32_MAX; 246 size_t i = 0; 247 size_t residual = length & 0x7; 248 249 assert(length > 0); 250 251 const int32_t* p_start = vector; 252 int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX); 253 int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX); 254 255 // First part, unroll the loop 8 times. 256 for (i = 0; i < length - residual; i += 8) { 257 int32x4_t in32x4_0 = vld1q_s32(p_start); 258 p_start += 4; 259 int32x4_t in32x4_1 = vld1q_s32(p_start); 260 p_start += 4; 261 min32x4_0 = vminq_s32(min32x4_0, in32x4_0); 262 min32x4_1 = vminq_s32(min32x4_1, in32x4_1); 263 } 264 265 int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1); 266 #if defined(WEBRTC_ARCH_ARM64) 267 minimum = vminvq_s32(min32x4); 268 #else 269 int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4)); 270 min32x2 = vpmin_s32(min32x2, min32x2); 271 272 minimum = vget_lane_s32(min32x2, 0); 273 #endif 274 275 // Second part, do the remaining iterations (if any). 276 for (i = residual; i > 0; i--) { 277 if (*p_start < minimum) 278 minimum = *p_start; 279 p_start++; 280 } 281 return minimum; 282 } 283 284