/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_SSE

#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: "    TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */

//
// SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//

/*
 * Shared SSE kernel behind the Process()/ProcessL() specializations below.
 *
 * Every pass of the loop handles four frames: four coefficients are read from
 * each of coefsP and coefsN, four frames are read from sP (walking towards
 * lower addresses, lane order reversed) and four from sN (walking towards
 * higher addresses). The products are summed, scaled by volumeLR[0] and
 * volumeLR[1], and ADDED to the values already stored in out[0] and out[1].
 *
 * Template parameters:
 *   CHANNELS  1 or 2 (enforced by static_assert). With 1, the single sum is
 *             placed in both output lanes before the per-lane volume is applied.
 *             With 2, samples are read as interleaved L/R pairs.
 *   STRIDE    not referenced in this body.
 *   FIXED     true: coefsP/coefsN are used as-is and lerpP, coefsP1, coefsN1
 *             are never read. false: coefficients are interpolated with lerpP
 *             between coefsP/coefsP1 and between coefsN1/coefsN.
 *
 * Parameters:
 *   out              two floats, read-modify-written.
 *   count            number of coefficients per side; asserted (ALOG_ASSERT)
 *                    to be positive and a multiple of 8. Decremented by 4 per pass.
 *   coefsP, coefsN   coefficient arrays, read with aligned loads (_mm_load_ps).
 *   sP, sN           sample pointers, read with unaligned loads.
 *   volumeLR         two floats: left gain, right gain.
 *   lerpP            interpolation factor (only when !FIXED).
 *   coefsP1, coefsN1 second coefficient set, aligned loads (only when !FIXED).
 */
template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessSSEIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8

    // The positive side is consumed four frames at a time towards lower
    // addresses, so back up to the first float of the initial group of four.
    sP -= CHANNELS * (4 - 1);

    // Broadcast interpolation factor; never consulted when FIXED.
    const __m128 lerp = FIXED ? _mm_setzero_ps() : _mm_set1_ps(lerpP);

    __m128 sumL = _mm_setzero_ps();
    __m128 sumR = _mm_setzero_ps(); // only accumulated into when CHANNELS == 2

    do {
        __m128 cPos = _mm_load_ps(coefsP);
        __m128 cNeg = _mm_load_ps(coefsN);
        coefsP += 4;
        coefsN += 4;

        if (!FIXED) {
            const __m128 cPosNext = _mm_load_ps(coefsP1);
            const __m128 cNegNext = _mm_load_ps(coefsN1);
            coefsP1 += 4;
            coefsN1 += 4;

            // cPos = lerp * (cPosNext - cPos) + cPos
            // cNeg = lerp * (cNeg - cNegNext) + cNegNext
            cPos = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(cPosNext, cPos), lerp), cPos);
            cNeg = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(cNeg, cNegNext), lerp), cNegNext);
        }

        if (CHANNELS == 1) {
            const __m128 back = _mm_loadu_ps(sP);
            const __m128 fwd = _mm_loadu_ps(sN);
            sP -= 4;
            sN += 4;

            // Reverse lane order (mask 0x1B) so lane i pairs with coefficient i.
            const __m128 backRev = _mm_shuffle_ps(back, back, _MM_SHUFFLE(0, 1, 2, 3));

            sumL = _mm_add_ps(sumL, _mm_mul_ps(backRev, cPos));
            sumL = _mm_add_ps(sumL, _mm_mul_ps(fwd, cNeg));
        } else {
            // Four interleaved stereo frames per side = eight floats.
            const __m128 backLo = _mm_loadu_ps(sP);
            const __m128 backHi = _mm_loadu_ps(sP + 4);
            const __m128 fwdLo = _mm_loadu_ps(sN);
            const __m128 fwdHi = _mm_loadu_ps(sN + 4);
            sP -= 8;
            sN += 8;

            // Deinterleave; the positive side is additionally reversed.
            // Masks are 0x22 / 0x77 (even / odd lanes, reversed) and
            // 0x88 / 0xDD (even / odd lanes, in order).
            const __m128 backL = _mm_shuffle_ps(backHi, backLo, _MM_SHUFFLE(0, 2, 0, 2));
            const __m128 backR = _mm_shuffle_ps(backHi, backLo, _MM_SHUFFLE(1, 3, 1, 3));
            const __m128 fwdL = _mm_shuffle_ps(fwdLo, fwdHi, _MM_SHUFFLE(2, 0, 2, 0));
            const __m128 fwdR = _mm_shuffle_ps(fwdLo, fwdHi, _MM_SHUFFLE(3, 1, 3, 1));

            sumL = _mm_add_ps(sumL, _mm_mul_ps(backL, cPos));
            sumR = _mm_add_ps(sumR, _mm_mul_ps(backR, cPos));
            sumL = _mm_add_ps(sumL, _mm_mul_ps(fwdL, cNeg));
            sumR = _mm_add_ps(sumR, _mm_mul_ps(fwdR, cNeg));
        }
    } while ((count -= 4) != 0);

    // Low two lanes: {volumeLR[0], volumeLR[1]}; high two lanes stay zero.
    const __m128 gain = _mm_loadl_pi(_mm_setzero_ps(),
            reinterpret_cast<const __m64*>(volumeLR));
    // Low two lanes: current {out[0], out[1]}; high lanes are don't-care.
    const __m128 previous = _mm_loadl_pi(gain, reinterpret_cast<__m64*>(out));

    // Collapse the four partial sums so lanes 0 and 1 hold the totals.
    __m128 total;
    if (CHANNELS == 1) {
        // Fold the upper half onto the lower half, then swap-add lanes 0/1
        // (mask 0x11) so both lanes carry the full mono sum.
        const __m128 halves = _mm_add_ps(sumL, _mm_movehl_ps(sumL, sumL));
        total = _mm_add_ps(halves,
                _mm_shuffle_ps(halves, halves, _MM_SHUFFLE(0, 1, 0, 1)));
    } else {
        // Two horizontal adds leave lane 0 = sum(L), lane 1 = sum(R).
        const __m128 pairs = _mm_hadd_ps(sumL, sumR);
        total = _mm_hadd_ps(pairs, pairs);
    }

    // Apply per-channel volume and accumulate into the two output floats.
    _mm_storel_pi(reinterpret_cast<__m64*>(out),
            _mm_add_ps(previous, _mm_mul_ps(total, gain)));
}

// Mono, fixed (non-interpolated) coefficients.
template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<1, 16, true>(
            out, count, coefsP, coefsN, sP, sN, volumeLR,
            0.0f /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
}

// Stereo, fixed (non-interpolated) coefficients.
template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<2, 16, true>(
            out, count, coefsP, coefsN, sP, sN, volumeLR,
            0.0f /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
}

// Mono, coefficients interpolated by lerpP.
template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<1, 16, false>(
            out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

// Stereo, coefficients interpolated by lerpP.
template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<2, 16, false>(
            out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

#endif //USE_SSE

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/