Home | History | Annotate | Download | only in libaudioprocessing
      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
     18 #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
     19 
     20 namespace android {
     21 
     22 // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
     23 
     24 #if USE_SSE
     25 
     26 #define TO_STRING2(x) #x
     27 #define TO_STRING(x) TO_STRING2(x)
     28 // uncomment to print GCC version, may be relevant for intrinsic optimizations
     29 /* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
     30         "." TO_STRING(__GNUC_MINOR__) \
     31         "." TO_STRING(__GNUC_PATCHLEVEL__)) */
     32 
     33 //
     34 // SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
     35 //
     36 
     37 template <int CHANNELS, int STRIDE, bool FIXED>
     38 static inline void ProcessSSEIntrinsic(float* out,
     39         int count,
     40         const float* coefsP,
     41         const float* coefsN,
     42         const float* sP,
     43         const float* sN,
     44         const float* volumeLR,
     45         float lerpP,
     46         const float* coefsP1,
     47         const float* coefsN1)
     48 {
     49     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
     50     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
     51 
     52     sP -= CHANNELS*(4-1);   // adjust sP for a loop iteration of four
     53 
     54     __m128 interp;
     55     if (!FIXED) {
     56         interp = _mm_set1_ps(lerpP);
     57     }
     58 
     59     __m128 accL, accR;
     60     accL = _mm_setzero_ps();
     61     if (CHANNELS == 2) {
     62         accR = _mm_setzero_ps();
     63     }
     64 
     65     do {
     66         __m128 posCoef = _mm_load_ps(coefsP);
     67         __m128 negCoef = _mm_load_ps(coefsN);
     68         coefsP += 4;
     69         coefsN += 4;
     70 
     71         if (!FIXED) { // interpolate
     72             __m128 posCoef1 = _mm_load_ps(coefsP1);
     73             __m128 negCoef1 = _mm_load_ps(coefsN1);
     74             coefsP1 += 4;
     75             coefsN1 += 4;
     76 
     77             // Calculate the final coefficient for interpolation
     78             // posCoef = interp * (posCoef1 - posCoef) + posCoef
     79             // negCoef = interp * (negCoef - negCoef1) + negCoef1
     80             posCoef1 = _mm_sub_ps(posCoef1, posCoef);
     81             negCoef = _mm_sub_ps(negCoef, negCoef1);
     82 
     83             posCoef1 = _mm_mul_ps(posCoef1, interp);
     84             negCoef = _mm_mul_ps(negCoef, interp);
     85 
     86             posCoef = _mm_add_ps(posCoef1, posCoef);
     87             negCoef = _mm_add_ps(negCoef, negCoef1);
     88         }
     89         switch (CHANNELS) {
     90         case 1: {
     91             __m128 posSamp = _mm_loadu_ps(sP);
     92             __m128 negSamp = _mm_loadu_ps(sN);
     93             sP -= 4;
     94             sN += 4;
     95 
     96             posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
     97             posSamp = _mm_mul_ps(posSamp, posCoef);
     98             negSamp = _mm_mul_ps(negSamp, negCoef);
     99 
    100             accL = _mm_add_ps(accL, posSamp);
    101             accL = _mm_add_ps(accL, negSamp);
    102         } break;
    103         case 2: {
    104             __m128 posSamp0 = _mm_loadu_ps(sP);
    105             __m128 posSamp1 = _mm_loadu_ps(sP+4);
    106             __m128 negSamp0 = _mm_loadu_ps(sN);
    107             __m128 negSamp1 = _mm_loadu_ps(sN+4);
    108             sP -= 8;
    109             sN += 8;
    110 
    111             // deinterleave everything and reverse the positives
    112             __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
    113             __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
    114             __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
    115             __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
    116 
    117             posSampL = _mm_mul_ps(posSampL, posCoef);
    118             posSampR = _mm_mul_ps(posSampR, posCoef);
    119             negSampL = _mm_mul_ps(negSampL, negCoef);
    120             negSampR = _mm_mul_ps(negSampR, negCoef);
    121 
    122             accL = _mm_add_ps(accL, posSampL);
    123             accR = _mm_add_ps(accR, posSampR);
    124             accL = _mm_add_ps(accL, negSampL);
    125             accR = _mm_add_ps(accR, negSampR);
    126         } break;
    127         }
    128     } while (count -= 4);
    129 
    130     // multiply by volume and save
    131     __m128 vLR = _mm_setzero_ps();
    132     __m128 outSamp;
    133     vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
    134     outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
    135 
    136     // combine and funnel down accumulator
    137     __m128 outAccum = _mm_setzero_ps();
    138     if (CHANNELS == 1) {
    139         // duplicate accL to both L and R
    140         outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
    141         outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
    142     } else if (CHANNELS == 2) {
    143         // accR contains R, fold in
    144         outAccum = _mm_hadd_ps(accL, accR);
    145         outAccum = _mm_hadd_ps(outAccum, outAccum);
    146     }
    147 
    148     outAccum = _mm_mul_ps(outAccum, vLR);
    149     outSamp = _mm_add_ps(outSamp, outAccum);
    150     _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
    151 }
    152 
    153 template<>
    154 inline void ProcessL<1, 16>(float* const out,
    155         int count,
    156         const float* coefsP,
    157         const float* coefsN,
    158         const float* sP,
    159         const float* sN,
    160         const float* const volumeLR)
    161 {
    162     ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
    163             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
    164 }
    165 
    166 template<>
    167 inline void ProcessL<2, 16>(float* const out,
    168         int count,
    169         const float* coefsP,
    170         const float* coefsN,
    171         const float* sP,
    172         const float* sN,
    173         const float* const volumeLR)
    174 {
    175     ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
    176             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
    177 }
    178 
    179 template<>
    180 inline void Process<1, 16>(float* const out,
    181         int count,
    182         const float* coefsP,
    183         const float* coefsN,
    184         const float* coefsP1,
    185         const float* coefsN1,
    186         const float* sP,
    187         const float* sN,
    188         float lerpP,
    189         const float* const volumeLR)
    190 {
    191     ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
    192             lerpP, coefsP1, coefsN1);
    193 }
    194 
    195 template<>
    196 inline void Process<2, 16>(float* const out,
    197         int count,
    198         const float* coefsP,
    199         const float* coefsN,
    200         const float* coefsP1,
    201         const float* coefsN1,
    202         const float* sP,
    203         const float* sN,
    204         float lerpP,
    205         const float* const volumeLR)
    206 {
    207     ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
    208             lerpP, coefsP1, coefsN1);
    209 }
    210 
    211 #endif //USE_SSE
    212 
    213 } // namespace android
    214 
    215 #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/
    216