/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON

// use intrinsics if inline arm32 assembly is not possible
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif

// the following intrinsics are available only with the ARM 64-bit ACLE
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif

#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */

//
// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
// Two variants are presented here:
// ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
// ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
//

// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// These are only used for inline assembly.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output*/\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R*/\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume*/\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result*/

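// In scalar terms, the two macros above do approximately the following (a sketch
// only; hsum(), mulq31() and satadd() are hypothetical helpers standing in for the
// horizontal vpadd reduction, the Q31 rounding doubling high-half multiply
// vqrdmulh, and the saturating vqadd):
//
//     int32_t sumL = hsum(q0);                   // add the 4 partial sums in q0
//     int32_t sumR = stereo ? hsum(q4) : sumL;   // mono replicates L into R
//     out[0] = satadd(out[0], mulq31(sumL, volumeLR[0]));
//     out[1] = satadd(out[1], mulq31(sumR, volumeLR[1]));
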
template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int16_t* coefsP1,
        const int16_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);

    int16x4_t interp;
    if (!FIXED) {
        interp = vdup_n_s16(lerpP);
        //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
        coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
        int16x8_t posCoef = vld1q_s16(coefsP);
        coefsP += 8;
        int16x8_t negCoef = vld1q_s16(coefsN);
        coefsN += 8;
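        // When interpolating, the effective coefficient is a linear blend of the two
        // coefficient sets: the difference is scaled by the Q15 phase fraction lerpP
        // (via vqrdmulh, a rounding doubling high-half multiply) and added back.
        // Note the positive and negative sides take the difference in opposite order.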
        if (!FIXED) { // interpolate
            int16x8_t posCoef1 = vld1q_s16(coefsP1);
            coefsP1 += 8;
            int16x8_t negCoef1 = vld1q_s16(coefsN1);
            coefsN1 += 8;

            posCoef1 = vsubq_s16(posCoef1, posCoef);
            negCoef = vsubq_s16(negCoef, negCoef1);

            posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
            negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);

            posCoef = vaddq_s16(posCoef, posCoef1);
            negCoef = vaddq_s16(negCoef, negCoef1);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int32_t* coefsP1,
        const int32_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);

    int32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_s32(lerpP);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
#ifdef vld1q_s32_x2
        int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
        coefsP += 8;
        int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
        coefsN += 8;
#else
        int32x4x2_t posCoef;
        posCoef.val[0] = vld1q_s32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_s32(coefsP);
        coefsP += 4;
        int32x4x2_t negCoef;
        negCoef.val[0] = vld1q_s32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_s32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_s32_x2
            int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
            coefsP1 += 8;
            int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
            coefsN1 += 8;
#else
            int32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            int32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_s32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_s32(coefsN1);
            coefsN1 += 4;
#endif

            posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);

            posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
            posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
            negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
            negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);

            posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
            posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
            negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

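            // Widen the 16-bit samples to Q31 (vshll by 15) so that vqrdmulh against
            // the Q31 coefficients yields the Q31 product of sample and coefficient.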
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // left
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            // right
            posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
            posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
            negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
            negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum2 = vaddq_s32(accum2, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum2 = vaddq_s32(accum2, posSamp1);
            accum2 = vaddq_s32(accum2, negSamp0);

            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);

    float32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_f32(lerpP);
        coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
    }
    float32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_f32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_f32(0);
    }
    do {
#ifdef vld1q_f32_x2
        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
        coefsP += 8;
        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
        coefsN += 8;
#else
        float32x4x2_t posCoef;
        posCoef.val[0] = vld1q_f32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_f32(coefsP);
        coefsP += 4;
        float32x4x2_t negCoef;
        negCoef.val[0] = vld1q_f32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_f32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_f32_x2
            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
            coefsP1 += 8;
            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
            coefsN1 += 8;
#else
            float32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            float32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_f32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_f32(coefsN1);
            coefsN1 += 4;
#endif
            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
        }
        switch (CHANNELS) {
        case 1: {
#ifdef vld1q_f32_x2
            float32x4x2_t posSamp = vld1q_f32_x2(sP);
            float32x4x2_t negSamp = vld1q_f32_x2(sN);
            sN += 8;
            sP -= 8;
#else
            float32x4x2_t posSamp;
            posSamp.val[0] = vld1q_f32(sP);
            sP += 4;
            posSamp.val[1] = vld1q_f32(sP);
            sP -= 12;
            float32x4x2_t negSamp;
            negSamp.val[0] = vld1q_f32(sN);
            sN += 4;
            negSamp.val[1] = vld1q_f32(sN);
            sN += 4;
#endif
            // effectively we want a vrev128q_f32()
            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
            posSamp.val[0] = vcombine_f32(
                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
            posSamp.val[1] = vcombine_f32(
                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
        } break;
        case 2: {
            float32x4x2_t posSamp0 = vld2q_f32(sP);
            sP += 8;
            float32x4x2_t negSamp0 = vld2q_f32(sN);
            sN += 8;
            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
            posSamp0.val[0] = vcombine_f32(
                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
            posSamp0.val[1] = vcombine_f32(
                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

            float32x4x2_t posSamp1 = vld2q_f32(sP);
            sP -= 24;
            float32x4x2_t negSamp1 = vld2q_f32(sN);
            sN += 8;
            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
            posSamp1.val[0] = vcombine_f32(
                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
            posSamp1.val[1] = vcombine_f32(
                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

            // Note: speed is affected by accumulation order.
            // Also, speed appears slower using vmul/vadd instead of vmla for
            // stereo case, comparable for mono.

            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
    float32x2_t vLR = vld1_f32(volumeLR);
    float32x2_t outSamp = vld1_f32(out);
    // combine and funnel down accumulator
    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_f32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        outAccum = vpadd_f32(outAccum, outAccum2);
    }
    outSamp = vmla_f32(outSamp, outAccum, vLR);
    vst1_f32(out, outSamp);
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

         ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else

    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

#endif //USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/