Home | History | Annotate | Download | only in audioflinger
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
     18 #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
     19 
     20 namespace android {
     21 
     22 // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
     23 
     24 #if USE_NEON
     25 //
     26 // NEON specializations are enabled for Process() and ProcessL()
     27 //
     28 // TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
     29 // and looping stride 16 (or vice versa). This has some polyphase coef data alignment
     30 // issues with S16 coefs. Consider this later.
     31 
     32 // Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
     33 #define ASSEMBLY_ACCUMULATE_MONO \
        /* Mono epilogue, pasted as the tail of the asm string in the mono kernels.    */\
        /* Reduces the four 32-bit partial sums held in q0 to one value, copies it     */\
        /* into both lanes, scales each lane by its volume and saturating-adds the     */\
        /* pair onto the two 32-bit values at %[out].                                  */\
        /* Needs asm operands named [vLR] (register holding the volume pointer) and    */\
        /* [out] (memory operand); overwrites d0-d3, i.e. q0 and q1.                   */\
     34         "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load both volumes (64-bit alignment hint) */\
     35         "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load of the two current output values */\
     36         "vpadd.s32      d0, d0, d1               \n"/* (1) pairwise add: 4 partial sums -> 2 */\
     37         "vpadd.s32      d0, d0, d0               \n"/* (1+4d) final sum, replicated in both lanes (L/R) */\
     38         "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume (saturating rounding doubling multiply high) */\
     39         "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
     40         "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store both output values */
     41 
     42 #define ASSEMBLY_ACCUMULATE_STEREO \
        /* Stereo epilogue, pasted as the tail of the asm string in the stereo kernels. */\
        /* Reduces the four partial sums in q0 (left) and the four in q4 (right) to     */\
        /* one value each, packs them as an L/R pair in d0, scales each lane by its     */\
        /* volume and saturating-adds the pair onto the two 32-bit values at %[out].    */\
        /* Needs asm operands named [vLR] (register holding the volume pointer) and     */\
        /* [out] (memory operand); overwrites d0-d3 and d8 (q0, q1 and half of q4).     */\
     43         "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load both volumes (64-bit alignment hint) */\
     44         "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load of the two current output values */\
     45         "vpadd.s32      d0, d0, d1               \n"/* (1) pairwise add the 4 partial sums from q0 */\
     46         "vpadd.s32      d8, d8, d9               \n"/* (1) pairwise add the 4 partial sums from q4 */\
     47         "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R: lane 0 = q0 total, lane 1 = q4 total */\
     48         "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume (saturating rounding doubling multiply high) */\
     49         "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
     50         "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store both output values */
     51 
     52 template <>
        // NEON specialization of ProcessL: mono samples, stride 16, 16-bit coefficients.
        // The coefficients are used as loaded; no interpolation between coefficient sets.
        //
        // Multiply-accumulates samples against coefficients on both sides of the filter,
        // 8 taps per side per loop pass, then scales the sum by the volumes and
        // saturating-adds it onto the two 32-bit values at out[0] and out[1].
        //
        //   out      - two int32 values; read, accumulated into, and written back.
        //   count    - taps per side; reduced by 8 each pass and the loop exits only when
        //              it reaches exactly 0, so it must be a positive multiple of 8.
        //   coefsP   - coefficients for the sP side, read forwards; loaded with a 128-bit
        //              alignment hint.
        //   coefsN   - coefficients for the sN side, read forwards; same alignment hint.
        //   sP       - samples read backwards, starting with the sample sP points at.
        //   sN       - samples read forwards from sN.
        //   volumeLR - two int32 volumes; loaded with a 64-bit alignment hint.
     53 inline void ProcessL<1, 16>(int32_t* const out,
     54         int count,
     55         const int16_t* coefsP,
     56         const int16_t* coefsN,
     57         const int16_t* sP,
     58         const int16_t* sN,
     59         const int32_t* const volumeLR)
     60 {
     61     const int CHANNELS = 1; // template specialization does not preserve params
     62     const int STRIDE = 16;
            // Rewind by 7 samples so that each 8-sample load ends on the sample sP pointed at.
     63     sP -= CHANNELS*((STRIDE>>1)-1);
     64     asm (
     65         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
     66 
     67         "1:                                      \n"
     68 
     69         "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
     70         "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
     71         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
     72         "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
     73 
     74         "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
     75 
     76         // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
                // after the per-half reverse, d5 pairs with the first 4 coefs (d16) and d4 with the next 4 (d17)
     77         "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
     78         "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
     79         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
     80         "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
     81 
     82         // moving these ARM instructions before neon above seems to be slower
     83         "subs           %[count], %[count], #8   \n"// (1) update loop counter
     84         "sub            %[sP], %[sP], #16        \n"// (0) move pointer back 8 samples to the next set
     85 
     86         // sP used after branch (warning)
     87         "bne            1b                       \n"// loop
     88 
     89          ASSEMBLY_ACCUMULATE_MONO
     90 
                // NOTE(review): [out] is declared write-only ("=") but the accumulate macro loads
                // from it before storing, and the 64-bit store also covers out[1]. The "memory"
                // clobber appears to be what keeps this safe - confirm before changing either.
     91         : [out]     "=Uv" (out[0]),
     92           [count]   "+r" (count),
     93           [coefsP0] "+r" (coefsP),
     94           [coefsN0] "+r" (coefsN),
     95           [sP]      "+r" (sP),
     96           [sN]      "+r" (sN)
     97         : [vLR]     "r" (volumeLR)
     98         : "cc", "memory",
     99           "q0", "q1", "q2", "q3",
    100           "q8", "q10"
    101     );
    102 }
    103 
    104 template <>
        // NEON specialization of ProcessL: interleaved stereo samples, stride 16,
        // 16-bit coefficients. The coefficients are used as loaded; no interpolation.
        //
        // Each loop pass de-interleaves 8 stereo frames from each side, multiply-accumulates
        // the left channel into q0 and the right channel into q4 against the same
        // coefficients, then the epilogue scales the two sums by the volumes and
        // saturating-adds them onto out[0] and out[1].
        //
        //   out      - two int32 values; read, accumulated into, and written back.
        //   count    - taps per side; reduced by 8 each pass and the loop exits only when
        //              it reaches exactly 0, so it must be a positive multiple of 8.
        //   coefsP   - coefficients for the sP side, read forwards; loaded with a 128-bit
        //              alignment hint.
        //   coefsN   - coefficients for the sN side, read forwards; same alignment hint.
        //   sP       - stereo frames read backwards, starting with the frame sP points at.
        //   sN       - stereo frames read forwards from sN.
        //   volumeLR - two int32 volumes; loaded with a 64-bit alignment hint.
    105 inline void ProcessL<2, 16>(int32_t* const out,
    106         int count,
    107         const int16_t* coefsP,
    108         const int16_t* coefsN,
    109         const int16_t* sP,
    110         const int16_t* sN,
    111         const int32_t* const volumeLR)
    112 {
    113     const int CHANNELS = 2; // template specialization does not preserve params
    114     const int STRIDE = 16;
            // Rewind by 7 frames (14 int16) so that each 8-frame load ends on the frame sP pointed at.
    115     sP -= CHANNELS*((STRIDE>>1)-1);
    116     asm (
    117         "veor           q0, q0, q0               \n"// (1) acc_L = 0
    118         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
    119 
    120         "1:                                      \n"
    121 
    122         "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
    123         "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
    124         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
    125         "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
    126 
    127         "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
    128         "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive
    129 
                // after the per-half reverse, the upper d register pairs with d16 and the lower with d17
    130         "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
    131         "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
    132         "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
    133         "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
    134         "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
    135         "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
    136         "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
    137         "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
    138 
    139         // moving these ARM before neon seems to be slower
    140         "subs           %[count], %[count], #8   \n"// (1) update loop counter
    141         "sub            %[sP], %[sP], #32        \n"// (0) move pointer back 8 frames to the next set
    142 
    143         // sP used after branch (warning)
    144         "bne            1b                       \n"// loop
    145 
    146         ASSEMBLY_ACCUMULATE_STEREO
    147 
                // NOTE(review): [out] is declared write-only ("=") but the accumulate macro loads
                // from it before storing, and the 64-bit store also covers out[1]. The "memory"
                // clobber appears to be what keeps this safe - confirm before changing either.
    148         : [out] "=Uv" (out[0]),
    149           [count] "+r" (count),
    150           [coefsP0] "+r" (coefsP),
    151           [coefsN0] "+r" (coefsN),
    152           [sP] "+r" (sP),
    153           [sN] "+r" (sN)
    154         : [vLR] "r" (volumeLR)
    155         : "cc", "memory",
    156           "q0", "q1", "q2", "q3",
    157           "q4", "q5", "q6",
    158           "q8", "q10"
    159      );
    160 }
    161 
    162 template <>
        // NEON specialization of Process: mono samples, stride 16, 16-bit coefficients,
        // with linear interpolation between two coefficient sets.
        //
        // Per loop pass (8 taps per side) the coefficients actually applied are
        //   sP side: coefsP  + lerpP * (coefsP1 - coefsP)
        //   sN side: coefsN1 + lerpP * (coefsN  - coefsN1)
        // where the product is a vqrdmulh.s16 by the low 16 bits of lerpP. The samples are
        // then multiply-accumulated into q0, and the epilogue scales the sum by the volumes
        // and saturating-adds it onto out[0] and out[1].
        //
        //   out      - two int32 values; read, accumulated into, and written back.
        //   count    - taps per side; reduced by 8 each pass and the loop exits only when
        //              it reaches exactly 0, so it must be a positive multiple of 8.
        //   coefsP, coefsP1, coefsN, coefsN1
        //            - coefficient arrays, read forwards; loaded with a 128-bit alignment hint.
        //   sP       - samples read backwards, starting with the sample sP points at.
        //   sN       - samples read forwards from sN.
        //   lerpP    - interpolation fraction; only its low 16 bits reach the multiply.
        //   volumeLR - two int32 volumes; loaded with a 64-bit alignment hint.
    163 inline void Process<1, 16>(int32_t* const out,
    164         int count,
    165         const int16_t* coefsP,
    166         const int16_t* coefsN,
    167         const int16_t* coefsP1,
    168         const int16_t* coefsN1,
    169         const int16_t* sP,
    170         const int16_t* sN,
    171         uint32_t lerpP,
    172         const int32_t* const volumeLR)
    173 {
    174     const int CHANNELS = 1; // template specialization does not preserve params
    175     const int STRIDE = 16;
            // Rewind by 7 samples so that each 8-sample load ends on the sample sP pointed at.
    176     sP -= CHANNELS*((STRIDE>>1)-1);
    177     asm (
    178         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
    179         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
    180 
    181         "1:                                      \n"
    182 
    183         "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
    184         "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
    185         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
    186         "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
    187         "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
    188         "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
    189 
    190         "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
    191         "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs
    192 
    193         "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
    194         "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
    195 
    196         "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
    197 
    198         "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
    199         "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
    200 
    201         // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
    202         "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
    203         "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
    204         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
    205         "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
    206 
    207         // moving these ARM instructions before neon above seems to be slower
    208         "subs           %[count], %[count], #8   \n"// (1) update loop counter
    209         "sub            %[sP], %[sP], #16        \n"// (0) move pointer back 8 samples to the next set
    210 
    211         // sP used after branch (warning)
    212         "bne            1b                       \n"// loop
    213 
                // the epilogue reloads d2 with the volumes; lerpP is no longer needed by then
    214         ASSEMBLY_ACCUMULATE_MONO
    215 
                // NOTE(review): [out] is declared write-only ("=") but the accumulate macro loads
                // from it before storing, and the 64-bit store also covers out[1]. The "memory"
                // clobber appears to be what keeps this safe - confirm before changing either.
    216         : [out]     "=Uv" (out[0]),
    217           [count]   "+r" (count),
    218           [coefsP0] "+r" (coefsP),
    219           [coefsN0] "+r" (coefsN),
    220           [coefsP1] "+r" (coefsP1),
    221           [coefsN1] "+r" (coefsN1),
    222           [sP]      "+r" (sP),
    223           [sN]      "+r" (sN)
    224         : [lerpP]   "r" (lerpP),
    225           [vLR]     "r" (volumeLR)
    226         : "cc", "memory",
    227           "q0", "q1", "q2", "q3",
    228           "q8", "q9", "q10", "q11"
    229     );
    230 }
    231 
    232 template <>
        // NEON specialization of Process: interleaved stereo samples, stride 16,
        // 16-bit coefficients, with linear interpolation between two coefficient sets.
        //
        // Per loop pass (8 taps per side) the coefficients actually applied are
        //   sP side: coefsP  + lerpP * (coefsP1 - coefsP)
        //   sN side: coefsN1 + lerpP * (coefsN  - coefsN1)
        // where the product is a vqrdmulh.s16 by the low 16 bits of lerpP. The left channel
        // is accumulated in q0 and the right channel in q4 against the same coefficients;
        // the epilogue scales both sums by the volumes and saturating-adds them onto
        // out[0] and out[1].
        //
        //   out      - two int32 values; read, accumulated into, and written back.
        //   count    - taps per side; reduced by 8 each pass and the loop exits only when
        //              it reaches exactly 0, so it must be a positive multiple of 8.
        //   coefsP, coefsP1, coefsN, coefsN1
        //            - coefficient arrays, read forwards; loaded with a 128-bit alignment hint.
        //   sP       - stereo frames read backwards, starting with the frame sP points at.
        //   sN       - stereo frames read forwards from sN.
        //   lerpP    - interpolation fraction; only its low 16 bits reach the multiply.
        //   volumeLR - two int32 volumes; loaded with a 64-bit alignment hint.
    233 inline void Process<2, 16>(int32_t* const out,
    234         int count,
    235         const int16_t* coefsP,
    236         const int16_t* coefsN,
    237         const int16_t* coefsP1,
    238         const int16_t* coefsN1,
    239         const int16_t* sP,
    240         const int16_t* sN,
    241         uint32_t lerpP,
    242         const int32_t* const volumeLR)
    243 {
    244     const int CHANNELS = 2; // template specialization does not preserve params
    245     const int STRIDE = 16;
            // Rewind by 7 frames (14 int16) so that each 8-frame load ends on the frame sP pointed at.
    246     sP -= CHANNELS*((STRIDE>>1)-1);
    247     asm (
    248         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
    249         "veor           q0, q0, q0               \n"// (1) acc_L = 0
    250         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
    251 
    252         "1:                                      \n"
    253 
    254         "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
    255         "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
    256         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
    257         "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
    258         "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
    259         "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
    260 
    261         "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
    262         "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs
    263 
    264         "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
    265         "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
    266 
    267         "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
    268         "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive
    269 
    270         "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
    271         "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
    272 
    273         "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
    274         "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
    275         "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
    276         "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
    277         "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
    278         "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
    279         "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
    280         "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
    281 
    282         // moving these ARM before neon seems to be slower
    283         "subs           %[count], %[count], #8   \n"// (1) update loop counter
    284         "sub            %[sP], %[sP], #32        \n"// (0) move pointer back 8 frames to the next set
    285 
    286         // sP used after branch (warning)
    287         "bne            1b                       \n"// loop
    288 
                // the epilogue reloads d2 with the volumes; lerpP is no longer needed by then
    289         ASSEMBLY_ACCUMULATE_STEREO
    290 
                // NOTE(review): [out] is declared write-only ("=") but the accumulate macro loads
                // from it before storing, and the 64-bit store also covers out[1]. The "memory"
                // clobber appears to be what keeps this safe - confirm before changing either.
    291         : [out] "=Uv" (out[0]),
    292           [count] "+r" (count),
    293           [coefsP0] "+r" (coefsP),
    294           [coefsN0] "+r" (coefsN),
    295           [coefsP1] "+r" (coefsP1),
    296           [coefsN1] "+r" (coefsN1),
    297           [sP] "+r" (sP),
    298           [sN] "+r" (sN)
    299         : [lerpP]   "r" (lerpP),
    300           [vLR] "r" (volumeLR)
    301         : "cc", "memory",
    302           "q0", "q1", "q2", "q3",
    303           "q4", "q5", "q6",
    304           "q8", "q9", "q10", "q11"
    305     );
    306 }
    307 
    308 template <>
        // NEON specialization of ProcessL: mono samples, stride 16, 32-bit coefficients.
        // The coefficients are used as loaded; no interpolation between coefficient sets.
        //
        // Each loop pass handles 8 taps per side: the 16-bit samples are widened to 32 bits
        // with a left shift of 15, multiplied by the coefficients with vqrdmulh.s32, and
        // the products are added into q0 with plain (non-saturating) vadd. The epilogue
        // scales the sum by the volumes and saturating-adds it onto out[0] and out[1].
        //
        //   out      - two int32 values; read, accumulated into, and written back.
        //   count    - taps per side; reduced by 8 each pass and the loop exits only when
        //              it reaches exactly 0, so it must be a positive multiple of 8.
        //   coefsP   - coefficients for the sP side, read forwards; loaded with a 128-bit
        //              alignment hint.
        //   coefsN   - coefficients for the sN side, read forwards; same alignment hint.
        //   sP       - samples read backwards, starting with the sample sP points at.
        //   sN       - samples read forwards from sN.
        //   volumeLR - two int32 volumes; loaded with a 64-bit alignment hint.
    309 inline void ProcessL<1, 16>(int32_t* const out,
    310         int count,
    311         const int32_t* coefsP,
    312         const int32_t* coefsN,
    313         const int16_t* sP,
    314         const int16_t* sN,
    315         const int32_t* const volumeLR)
    316 {
    317     const int CHANNELS = 1; // template specialization does not preserve params
    318     const int STRIDE = 16;
            // Rewind by 7 samples so that each 8-sample load ends on the sample sP pointed at.
    319     sP -= CHANNELS*((STRIDE>>1)-1);
    320     asm (
    321         "veor           q0, q0, q0                    \n"// result, initialize to 0
    322 
    323         "1:                                           \n"
    324 
    325         "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
    326         "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
    327         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
    328         "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
    329 
    330         "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
    331 
    332         "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
    333         "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits
    334 
    335         "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
    336         "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
    337 
                // after the per-half reverse, d5 (q13) pairs with the first 4 coefs (q8) and d4 (q12) with the next 4 (q9)
    338         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
    339         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
    340         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
    341         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
    342 
    343         "vadd.s32       q0, q0, q12                   \n"// accumulate result
    344         "vadd.s32       q13, q13, q14                 \n"// accumulate result
    345         "vadd.s32       q0, q0, q15                   \n"// accumulate result
    346         "vadd.s32       q0, q0, q13                   \n"// accumulate result
    347 
    348         "sub            %[sP], %[sP], #16             \n"// move pointer back 8 samples to the next set
    349         "subs           %[count], %[count], #8        \n"// update loop counter
    350 
    351         "bne            1b                            \n"// loop
    352 
    353         ASSEMBLY_ACCUMULATE_MONO
    354 
                // NOTE(review): [out] is declared write-only ("=") but the accumulate macro loads
                // from it before storing, and the 64-bit store also covers out[1]. The "memory"
                // clobber appears to be what keeps this safe - confirm before changing either.
    355         : [out]     "=Uv" (out[0]),
    356           [count]   "+r" (count),
    357           [coefsP0] "+r" (coefsP),
    358           [coefsN0] "+r" (coefsN),
    359           [sP]      "+r" (sP),
    360           [sN]      "+r" (sN)
    361         : [vLR]     "r" (volumeLR)
    362         : "cc", "memory",
    363           "q0", "q1", "q2", "q3",
    364           "q8", "q9", "q10", "q11",
    365           "q12", "q13", "q14", "q15"
    366     );
    367 }
    368 
    369 template <>
        // NEON specialization of ProcessL: interleaved stereo samples, stride 16,
        // 32-bit coefficients. The coefficients are used as loaded; no interpolation.
        //
        // Each loop pass de-interleaves 8 stereo frames from each side. The left channel is
        // widened (left shift of 15), multiplied by the coefficients with vqrdmulh.s32 and
        // added into q0; the same sequence is then repeated for the right channel into q4,
        // reusing q12-q15 as scratch. The epilogue scales both sums by the volumes and
        // saturating-adds them onto out[0] and out[1].
        //
        //   out      - two int32 values; read, accumulated into, and written back.
        //   count    - taps per side; reduced by 8 each pass and the loop exits only when
        //              it reaches exactly 0, so it must be a positive multiple of 8.
        //   coefsP   - coefficients for the sP side, read forwards; loaded with a 128-bit
        //              alignment hint.
        //   coefsN   - coefficients for the sN side, read forwards; same alignment hint.
        //   sP       - stereo frames read backwards, starting with the frame sP points at.
        //   sN       - stereo frames read forwards from sN.
        //   volumeLR - two int32 volumes; loaded with a 64-bit alignment hint.
    370 inline void ProcessL<2, 16>(int32_t* const out,
    371         int count,
    372         const int32_t* coefsP,
    373         const int32_t* coefsN,
    374         const int16_t* sP,
    375         const int16_t* sN,
    376         const int32_t* const volumeLR)
    377 {
    378     const int CHANNELS = 2; // template specialization does not preserve params
    379     const int STRIDE = 16;
            // Rewind by 7 frames (14 int16) so that each 8-frame load ends on the frame sP pointed at.
    380     sP -= CHANNELS*((STRIDE>>1)-1);
    381     asm (
    382         "veor           q0, q0, q0                    \n"// left result, initialize to 0
    383         "veor           q4, q4, q4                    \n"// right result, initialize to 0
    384 
    385         "1:                                           \n"
    386 
    387         "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames (q2 = left, q3 = right)
    388         "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames (q5 = left, q6 = right)
    389         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
    390         "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
    391 
    392         "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side (left)
    393         "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side (right)
    394 
                // left channel
    395         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
    396         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
    397 
    398         "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
    399         "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
    400 
    401         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
    402         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
    403         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
    404         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
    405 
    406         "vadd.s32       q0, q0, q12                   \n"// accumulate result
    407         "vadd.s32       q13, q13, q14                 \n"// accumulate result
    408         "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
    409         "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
    410 
                // right channel, same coefficients, scratch registers reused
    411         "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
    412         "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
    413 
    414         "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
    415         "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
    416 
    417         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
    418         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
    419         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
    420         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
    421 
    422         "vadd.s32       q4, q4, q12                   \n"// accumulate result
    423         "vadd.s32       q13, q13, q14                 \n"// accumulate result
    424         "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
    425         "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
    426 
    427         "subs           %[count], %[count], #8        \n"// update loop counter
    428         "sub            %[sP], %[sP], #32             \n"// move pointer back 8 frames to the next set
    429 
    430         "bne            1b                            \n"// loop
    431 
    432         ASSEMBLY_ACCUMULATE_STEREO
    433 
                // NOTE(review): [out] is declared write-only ("=") but the accumulate macro loads
                // from it before storing, and the 64-bit store also covers out[1]. The "memory"
                // clobber appears to be what keeps this safe - confirm before changing either.
    434         : [out]     "=Uv" (out[0]),
    435           [count]   "+r" (count),
    436           [coefsP0] "+r" (coefsP),
    437           [coefsN0] "+r" (coefsN),
    438           [sP]      "+r" (sP),
    439           [sN]      "+r" (sN)
    440         : [vLR]     "r" (volumeLR)
    441         : "cc", "memory",
    442           "q0", "q1", "q2", "q3",
    443           "q4", "q5", "q6",
    444           "q8", "q9", "q10", "q11",
    445           "q12", "q13", "q14", "q15"
    446     );
    447 }
    448 
    449 template <>
        // NEON specialization of Process: mono samples, stride 16, 32-bit coefficients,
        // with linear interpolation between two coefficient sets.
        //
        // Per loop pass (8 taps per side) the coefficients actually applied are
        //   sP side: coefsP  + lerpP * (coefsP1 - coefsP)
        //   sN side: coefsN1 + lerpP * (coefsN  - coefsN1)
        // where the product is a vqrdmulh.s32 by lerpP. The 16-bit samples are widened with
        // a left shift of 15, multiplied by the interpolated coefficients with vqrdmulh.s32
        // and added into q0 with plain (non-saturating) vadd. The epilogue scales the sum by
        // the volumes and saturating-adds it onto out[0] and out[1].
        //
        //   out      - two int32 values; read, accumulated into, and written back.
        //   count    - taps per side; reduced by 8 each pass and the loop exits only when
        //              it reaches exactly 0, so it must be a positive multiple of 8.
        //   coefsP, coefsP1, coefsN, coefsN1
        //            - coefficient arrays, read forwards; loaded with a 128-bit alignment hint.
        //   sP       - samples read backwards, starting with the sample sP points at.
        //   sN       - samples read forwards from sN.
        //   lerpP    - interpolation fraction, used as a 32-bit scalar multiplier.
        //   volumeLR - two int32 volumes; loaded with a 64-bit alignment hint.
    450 inline void Process<1, 16>(int32_t* const out,
    451         int count,
    452         const int32_t* coefsP,
    453         const int32_t* coefsN,
    454         const int32_t* coefsP1,
    455         const int32_t* coefsN1,
    456         const int16_t* sP,
    457         const int16_t* sN,
    458         uint32_t lerpP,
    459         const int32_t* const volumeLR)
    460 {
    461     const int CHANNELS = 1; // template specialization does not preserve params
    462     const int STRIDE = 16;
            // Rewind by 7 samples so that each 8-sample load ends on the sample sP pointed at.
    463     sP -= CHANNELS*((STRIDE>>1)-1);
    464     asm (
    465         "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
    466         "veor           q0, q0, q0                    \n"// result, initialize to 0
    467 
    468         "1:                                           \n"
    469 
    470         "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
    471         "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
    472         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
    473         "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs for interpolation
    474         "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
    475         "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs for interpolation
    476 
    477         "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
    478         "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
    479         "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
    480         "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
    481 
    482         "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
    483         "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
    484         "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
    485         "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
    486 
    487         "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
    488         "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
    489         "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
    490         "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
    491 
    492         "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
    493 
                // q12-q15 are free again here and are reused for the widened samples
    494         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
    495         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
    496 
    497         "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
    498         "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits
    499 
    500         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
    501         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
    502         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
    503         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
    504 
    505         "vadd.s32       q0, q0, q12                   \n"// accumulate result
    506         "vadd.s32       q13, q13, q14                 \n"// accumulate result
    507         "vadd.s32       q0, q0, q15                   \n"// accumulate result
    508         "vadd.s32       q0, q0, q13                   \n"// accumulate result
    509 
    510         "sub            %[sP], %[sP], #16             \n"// move pointer back 8 samples to the next set
    511         "subs           %[count], %[count], #8        \n"// update loop counter
    512 
    513         "bne            1b                            \n"// loop
    514 
                // the epilogue reloads d2 with the volumes; lerpP is no longer needed by then
    515         ASSEMBLY_ACCUMULATE_MONO
    516 
                // NOTE(review): [out] is declared write-only ("=") but the accumulate macro loads
                // from it before storing, and the 64-bit store also covers out[1]. The "memory"
                // clobber appears to be what keeps this safe - confirm before changing either.
    517         : [out]     "=Uv" (out[0]),
    518           [count]   "+r" (count),
    519           [coefsP0] "+r" (coefsP),
    520           [coefsN0] "+r" (coefsN),
    521           [coefsP1] "+r" (coefsP1),
    522           [coefsN1] "+r" (coefsN1),
    523           [sP]      "+r" (sP),
    524           [sN]      "+r" (sN)
    525         : [lerpP]   "r" (lerpP),
    526           [vLR]     "r" (volumeLR)
    527         : "cc", "memory",
    528           "q0", "q1", "q2", "q3",
    529           "q8", "q9", "q10", "q11",
    530           "q12", "q13", "q14", "q15"
    531     );
    532 }
    533 
    534 template <>
    535 inline void Process<2, 16>(int32_t* const out,
    536         int count,
    537         const int32_t* coefsP,
    538         const int32_t* coefsN,
    539         const int32_t* coefsP1,
    540         const int32_t* coefsN1,
    541         const int16_t* sP,
    542         const int16_t* sN,
    543         uint32_t lerpP,
    544         const int32_t* const volumeLR)
    545 {
    546     const int CHANNELS = 2; // template specialization does not preserve params
    547     const int STRIDE = 16;
    548     sP -= CHANNELS*((STRIDE>>1)-1);
    549     asm (
    550         "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
    551         "veor           q0, q0, q0                    \n"// result, initialize to 0
    552         "veor           q4, q4, q4                    \n"// result, initialize to 0
    553 
    554         "1:                                           \n"
    555 
    556         "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
    557         "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
    558         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
    559         "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
    560         "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
    561         "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
    562 
    563         "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
    564         "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
    565         "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
    566         "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
    567 
    568         "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
    569         "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
    570         "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
    571         "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
    572 
    573         "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
    574         "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
    575         "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
    576         "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
    577 
    578         "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
    579         "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
    580 
    581         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
    582         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
    583 
    584         "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
    585         "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
    586 
    587         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
    588         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
    589         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
    590         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
    591 
    592         "vadd.s32       q0, q0, q12                   \n"// accumulate result
    593         "vadd.s32       q13, q13, q14                 \n"// accumulate result
    594         "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
    595         "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
    596 
    597         "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
    598         "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
    599 
    600         "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
    601         "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
    602 
    603         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
    604         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
    605         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
    606         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
    607 
    608         "vadd.s32       q4, q4, q12                   \n"// accumulate result
    609         "vadd.s32       q13, q13, q14                 \n"// accumulate result
    610         "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
    611         "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
    612 
    613         "subs           %[count], %[count], #8        \n"// update loop counter
    614         "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
    615 
    616         "bne            1b                            \n"// loop
    617 
    618         ASSEMBLY_ACCUMULATE_STEREO
    619 
    620         : [out]     "=Uv" (out[0]),
    621           [count]   "+r" (count),
    622           [coefsP0] "+r" (coefsP),
    623           [coefsN0] "+r" (coefsN),
    624           [coefsP1] "+r" (coefsP1),
    625           [coefsN1] "+r" (coefsN1),
    626           [sP]      "+r" (sP),
    627           [sN]      "+r" (sN)
    628         : [lerpP]   "r" (lerpP),
    629           [vLR]     "r" (volumeLR)
    630         : "cc", "memory",
    631           "q0", "q1", "q2", "q3",
    632           "q4", "q5", "q6",
    633           "q8", "q9", "q10", "q11",
    634           "q12", "q13", "q14", "q15"
    635     );
    636 }
    637 
    // NEON ProcessL<1, 8> for 16-bit coefficients: mono FIR accumulation with
    // no coefficient interpolation.
    //
    // Each loop iteration multiplies 4 positive-side samples (read backwards
    // from sP) and 4 negative-side samples (read forwards from sN) by the
    // matching coefficients and accumulates the widened 32-bit products in q0.
    // ASSEMBLY_ACCUMULATE_MONO (defined earlier in this file) then sums the
    // partial sums, applies the volumes loaded from volumeLR and saves the
    // result to out.
    //
    // Preconditions visible from the code:
    //  - count is decremented by 4 per iteration and the loop only exits on an
    //    exact zero ("subs" + "bne"), so it must be a positive multiple of 4.
    //  - coefsP and coefsN are loaded with a :64 alignment qualifier, i.e. they
    //    must be 8-byte aligned.
    //
    // NOTE(review): ASSEMBLY_ACCUMULATE_MONO loads from %[out] before storing,
    // yet the operand is declared write-only ("=Uv"); confirm whether "+Uv"
    // is the intended constraint.
    638 template <>
    639 inline void ProcessL<1, 8>(int32_t* const out,
    640         int count,
    641         const int16_t* coefsP,
    642         const int16_t* coefsN,
    643         const int16_t* sP,
    644         const int16_t* sN,
    645         const int32_t* const volumeLR)
    646 {
    647     const int CHANNELS = 1; // template specialization does not preserve params
    648     const int STRIDE = 8;
            // back up 3 samples so the 4-sample load below covers sP[-3..0]
    649     sP -= CHANNELS*((STRIDE>>1)-1);
    650     asm (
    651         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
    652 
    653         "1:                                      \n"
    654 
    655         "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples (positive side, no writeback)
    656         "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples (negative side, post-increment)
    657         "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive side)
    658         "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs (negative side)
    659 
    660         "vrev64.16      d4, d4                   \n"// (1) reverse the 4 positive-side samples: s3, s2, s1, s0
    661 
    662         // reordering the vmlal to do d6 before d4 is slower(?)
    663         "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples by coef, widening to 32 bits
    664         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
    665 
    666         // moving these ARM instructions before neon above seems to be slower
    667         "subs           %[count], %[count], #4   \n"// (1) update loop counter (4 coefs consumed per side)
    668         "sub            %[sP], %[sP], #8         \n"// (0) step sP back 4 samples (8 bytes)
    669 
    670         // sP used after branch (warning)
    671         "bne            1b                       \n"// loop until count reaches exactly 0
    672 
    673         ASSEMBLY_ACCUMULATE_MONO
    674 
    675         : [out]     "=Uv" (out[0]),
    676           [count]   "+r" (count),
    677           [coefsP0] "+r" (coefsP),
    678           [coefsN0] "+r" (coefsN),
    679           [sP]      "+r" (sP),
    680           [sN]      "+r" (sN)
    681         : [vLR]     "r" (volumeLR)
    682         : "cc", "memory",
    683           "q0", "q1", "q2", "q3",
    684           "q8", "q10"
    685     );
    686 }
    687 
    // NEON ProcessL<2, 8> for 16-bit coefficients: stereo FIR accumulation with
    // no coefficient interpolation.
    //
    // Each loop iteration de-interleaves 4 positive-side frames (read backwards
    // from sP) and 4 negative-side frames (read forwards from sN), multiplies
    // both channels by the same 4 coefficients per side and accumulates the
    // widened products in q0 (left) and q4 (right). ASSEMBLY_ACCUMULATE_STEREO
    // (defined earlier in this file) saves the two accumulators as stereo out;
    // volumeLR is handed to it as %[vLR].
    //
    // Preconditions visible from the code:
    //  - count is decremented by 4 per iteration and the loop only exits on an
    //    exact zero, so it must be a positive multiple of 4.
    //  - coefsP and coefsN use a :64 alignment qualifier (8-byte aligned).
    688 template <>
    689 inline void ProcessL<2, 8>(int32_t* const out,
    690         int count,
    691         const int16_t* coefsP,
    692         const int16_t* coefsN,
    693         const int16_t* sP,
    694         const int16_t* sN,
    695         const int32_t* const volumeLR)
    696 {
    697     const int CHANNELS = 2; // template specialization does not preserve params
    698     const int STRIDE = 8;
            // back up 3 frames so the 4-frame load below ends at the frame sP pointed to
    699     sP -= CHANNELS*((STRIDE>>1)-1);
    700     asm (
    701         "veor           q0, q0, q0               \n"// (1) acc_L = 0
    702         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
    703 
    704         "1:                                      \n"
    705 
    706         "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 8 16-bits stereo samples (4 frames, de-interleaved)
    707         "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 8 16-bits stereo samples (4 frames, de-interleaved)
    708         "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive side)
    709         "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs (negative side)
    710 
    711         "vrev64.16      q2, q2                   \n"// (1) reverse the 4 positive-side frames in d4 and in d5
    712 
    713         "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
    714         "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
    715         "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
    716         "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
    717 
    718         // moving these ARM before neon seems to be slower
    719         "subs           %[count], %[count], #4   \n"// (1) update loop counter (4 coefs consumed per side)
    720         "sub            %[sP], %[sP], #16        \n"// (0) step sP back 4 stereo frames (16 bytes)
    721 
    722         // sP used after branch (warning)
    723         "bne            1b                       \n"// loop until count reaches exactly 0
    724 
    725         ASSEMBLY_ACCUMULATE_STEREO
    726 
    727         : [out] "=Uv" (out[0]),
    728           [count] "+r" (count),
    729           [coefsP0] "+r" (coefsP),
    730           [coefsN0] "+r" (coefsN),
    731           [sP] "+r" (sP),
    732           [sN] "+r" (sN)
    733         : [vLR] "r" (volumeLR)
    734         : "cc", "memory",
    735           "q0", "q1", "q2", "q3",
    736           "q4", "q5", "q6",
    737           "q8", "q10"
    738      );
    739 }
    740 
    // NEON Process<1, 8> for 16-bit coefficients: mono FIR accumulation with
    // linear interpolation between two coefficient sets.
    //
    // Per iteration, 4 coefficients per side are interpolated before use:
    //   positive side: coefsP  + lerpP * (coefsP1 - coefsP)
    //   negative side: coefsN1 + lerpP * (coefsN  - coefsN1)
    // The scaling uses vqrdmulh.s16 with the scalar d2[0], i.e. the low 16 bits
    // of lerpP. The interpolated coefficients are then multiplied with
    // 4 reversed positive-side samples and 4 negative-side samples and
    // accumulated (widened to 32 bits) in q0. ASSEMBLY_ACCUMULATE_MONO (defined
    // earlier in this file) sums the partial sums, applies the volumes loaded
    // from volumeLR and saves the result to out.
    //
    // Preconditions visible from the code:
    //  - count is decremented by 4 per iteration and the loop only exits on an
    //    exact zero, so it must be a positive multiple of 4.
    //  - all four coefficient pointers use a :64 alignment qualifier
    //    (8-byte aligned).
    //
    // NOTE(review): ASSEMBLY_ACCUMULATE_MONO loads from %[out] before storing,
    // yet the operand is declared write-only ("=Uv"); confirm whether "+Uv"
    // is the intended constraint.
    741 template <>
    742 inline void Process<1, 8>(int32_t* const out,
    743         int count,
    744         const int16_t* coefsP,
    745         const int16_t* coefsN,
    746         const int16_t* coefsP1,
    747         const int16_t* coefsN1,
    748         const int16_t* sP,
    749         const int16_t* sN,
    750         uint32_t lerpP,
    751         const int32_t* const volumeLR)
    752 {
    753     const int CHANNELS = 1; // template specialization does not preserve params
    754     const int STRIDE = 8;
            // back up 3 samples so the 4-sample load below covers sP[-3..0]
    755     sP -= CHANNELS*((STRIDE>>1)-1);
    756     asm (
    757         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
    758         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
    759 
    760         "1:                                      \n"
    761 
    762         "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples (positive side, no writeback)
    763         "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples (negative side, post-increment)
    764         "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
    765         "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
    766         "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
    767         "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
    768 
    769         "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
    770         "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs
    771 
    772         "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
    773         "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
    774 
    775         "vrev64.16      d4, d4                   \n"// (1) reverse the 4 positive-side samples: s3, s2, s1, s0
    776 
    777         "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
    778         "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
    779 
    780         // reordering the vmlal to do d6 before d4 is slower(?)
    781         "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed) samples by coef
    782         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
    783 
    784         // moving these ARM instructions before neon above seems to be slower
    785         "subs           %[count], %[count], #4   \n"// (1) update loop counter (4 coefs consumed per side)
    786         "sub            %[sP], %[sP], #8        \n"// step sP back 4 samples (8 bytes)
    787 
    788         // sP used after branch (warning)
    789         "bne            1b                       \n"// loop until count reaches exactly 0
    790 
    791         ASSEMBLY_ACCUMULATE_MONO
    792 
    793         : [out]     "=Uv" (out[0]),
    794           [count]   "+r" (count),
    795           [coefsP0] "+r" (coefsP),
    796           [coefsN0] "+r" (coefsN),
    797           [coefsP1] "+r" (coefsP1),
    798           [coefsN1] "+r" (coefsN1),
    799           [sP]      "+r" (sP),
    800           [sN]      "+r" (sN)
    801         : [lerpP]   "r" (lerpP),
    802           [vLR]     "r" (volumeLR)
    803         : "cc", "memory",
    804           "q0", "q1", "q2", "q3",
    805           "q8", "q9", "q10", "q11"
    806     );
    807 }
    808 
    // NEON Process<2, 8> for 16-bit coefficients: stereo FIR accumulation with
    // linear interpolation between two coefficient sets.
    //
    // Per iteration, 4 coefficients per side are interpolated before use:
    //   positive side: coefsP  + lerpP * (coefsP1 - coefsP)
    //   negative side: coefsN1 + lerpP * (coefsN  - coefsN1)
    // The scaling uses vqrdmulh.s16 with the scalar d2[0], i.e. the low 16 bits
    // of lerpP. Both channels share the interpolated coefficients; products are
    // accumulated (widened to 32 bits) in q0 (left) and q4 (right).
    // ASSEMBLY_ACCUMULATE_STEREO (defined earlier in this file) saves the two
    // accumulators as stereo out; volumeLR is handed to it as %[vLR].
    //
    // Preconditions visible from the code:
    //  - count is decremented by 4 per iteration and the loop only exits on an
    //    exact zero, so it must be a positive multiple of 4.
    //  - all four coefficient pointers use a :64 alignment qualifier
    //    (8-byte aligned).
    809 template <>
    810 inline void Process<2, 8>(int32_t* const out,
    811         int count,
    812         const int16_t* coefsP,
    813         const int16_t* coefsN,
    814         const int16_t* coefsP1,
    815         const int16_t* coefsN1,
    816         const int16_t* sP,
    817         const int16_t* sN,
    818         uint32_t lerpP,
    819         const int32_t* const volumeLR)
    820 {
    821     const int CHANNELS = 2; // template specialization does not preserve params
    822     const int STRIDE = 8;
            // back up 3 frames so the 4-frame load below ends at the frame sP pointed to
    823     sP -= CHANNELS*((STRIDE>>1)-1);
    824     asm (
    825         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
    826         "veor           q0, q0, q0               \n"// (1) acc_L = 0
    827         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
    828 
    829         "1:                                      \n"
    830 
    831         "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples (4 frames, de-interleaved)
    832         "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples (4 frames, de-interleaved)
    833         "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
    834         "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
    835         "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
    836         "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
    837 
    838         "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
    839         "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs
    840 
    841         "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
    842         "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
    843 
    844         "vrev64.16      q2, q2                   \n"// (1) reverse the 4 positive-side frames in d4 and in d5
    845 
    846         "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
    847         "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
    848 
    849         "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
    850         "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
    851         "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
    852         "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
    853 
    854         // moving these ARM before neon seems to be slower
    855         "subs           %[count], %[count], #4   \n"// (1) update loop counter (4 coefs consumed per side)
    856         "sub            %[sP], %[sP], #16        \n"// step sP back 4 stereo frames (16 bytes)
    857 
    858         // sP used after branch (warning)
    859         "bne            1b                       \n"// loop until count reaches exactly 0
    860 
    861         ASSEMBLY_ACCUMULATE_STEREO
    862 
    863         : [out] "=Uv" (out[0]),
    864           [count] "+r" (count),
    865           [coefsP0] "+r" (coefsP),
    866           [coefsN0] "+r" (coefsN),
    867           [coefsP1] "+r" (coefsP1),
    868           [coefsN1] "+r" (coefsN1),
    869           [sP] "+r" (sP),
    870           [sN] "+r" (sN)
    871         : [lerpP]   "r" (lerpP),
    872           [vLR] "r" (volumeLR)
    873         : "cc", "memory",
    874           "q0", "q1", "q2", "q3",
    875           "q4", "q5", "q6",
    876           "q8", "q9", "q10", "q11"
    877     );
    878 }
    879 
    // NEON ProcessL<1, 8> for 32-bit coefficients: mono FIR accumulation with
    // no coefficient interpolation.
    //
    // Each loop iteration widens 4 reversed positive-side samples and
    // 4 negative-side samples to 32 bits (shifted left by 15), multiplies them
    // by the matching 32-bit coefficients with vqrdmulh.s32 and adds the
    // products into q0. ASSEMBLY_ACCUMULATE_MONO (defined earlier in this file)
    // then sums the partial sums, applies the volumes loaded from volumeLR and
    // saves the result to out.
    //
    // Preconditions visible from the code:
    //  - count is decremented by 4 per iteration and the loop only exits on an
    //    exact zero, so it must be a positive multiple of 4.
    //  - coefsP and coefsN use a :128 alignment qualifier (16-byte aligned).
    //
    // NOTE(review): ASSEMBLY_ACCUMULATE_MONO loads from %[out] before storing,
    // yet the operand is declared write-only ("=Uv"); confirm whether "+Uv"
    // is the intended constraint.
    880 template <>
    881 inline void ProcessL<1, 8>(int32_t* const out,
    882         int count,
    883         const int32_t* coefsP,
    884         const int32_t* coefsN,
    885         const int16_t* sP,
    886         const int16_t* sN,
    887         const int32_t* const volumeLR)
    888 {
    889     const int CHANNELS = 1; // template specialization does not preserve params
    890     const int STRIDE = 8;
            // back up 3 samples so the 4-sample load below covers sP[-3..0]
    891     sP -= CHANNELS*((STRIDE>>1)-1);
    892     asm (
    893         "veor           q0, q0, q0               \n"// result, initialize to 0
    894 
    895         "1:                                      \n"
    896 
    897         "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples (positive side, no writeback)
    898         "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples (negative side, post-increment)
    899         "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
    900         "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
    901 
    902         "vrev64.16      d4, d4                   \n"// reverse the 4 samples of the positive side
    903 
    904         "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
    905         "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
    906 
    907         "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
    908         "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
    909 
    910         "vadd.s32       q0, q0, q12              \n"// accumulate result
    911         "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result
    912 
    913         "subs           %[count], %[count], #4   \n"// update loop counter (4 coefs consumed per side)
    914         "sub            %[sP], %[sP], #8         \n"// step sP back 4 samples (8 bytes)
    915 
    916         "bne            1b                       \n"// loop until count reaches exactly 0
    917 
    918         ASSEMBLY_ACCUMULATE_MONO
    919 
    920         : [out] "=Uv" (out[0]),
    921           [count] "+r" (count),
    922           [coefsP0] "+r" (coefsP),
    923           [coefsN0] "+r" (coefsN),
    924           [sP] "+r" (sP),
    925           [sN] "+r" (sN)
    926         : [vLR] "r" (volumeLR)
    927         : "cc", "memory",
    928           "q0", "q1", "q2", "q3",
    929           "q8", "q9", "q10", "q11",
    930           "q12", "q14"
    931     );
    932 }
    933 
    // NEON ProcessL<2, 8> for 32-bit coefficients: stereo FIR accumulation with
    // no coefficient interpolation.
    //
    // Each loop iteration de-interleaves 4 positive-side frames (read backwards
    // from sP) and 4 negative-side frames (read forwards from sN), widens the
    // samples to 32 bits (shifted left by 15), multiplies both channels by the
    // same 4 coefficients per side with vqrdmulh.s32 and adds the products into
    // q0 and q4. ASSEMBLY_ACCUMULATE_STEREO (defined earlier in this file)
    // saves the two accumulators as stereo out; volumeLR is handed to it as
    // %[vLR].
    //
    // Preconditions visible from the code:
    //  - count is decremented by 4 per iteration and the loop only exits on an
    //    exact zero, so it must be a positive multiple of 4.
    //  - coefsP and coefsN use a :128 alignment qualifier (16-byte aligned).
    934 template <>
    935 inline void ProcessL<2, 8>(int32_t* const out,
    936         int count,
    937         const int32_t* coefsP,
    938         const int32_t* coefsN,
    939         const int16_t* sP,
    940         const int16_t* sN,
    941         const int32_t* const volumeLR)
    942 {
    943     const int CHANNELS = 2; // template specialization does not preserve params
    944     const int STRIDE = 8;
            // back up 3 frames so the 4-frame load below ends at the frame sP pointed to
    945     sP -= CHANNELS*((STRIDE>>1)-1);
    946     asm (
    947         "veor           q0, q0, q0               \n"// result (1st channel), initialize to 0
    948         "veor           q4, q4, q4               \n"// result (2nd channel), initialize to 0
    949 
    950         "1:                                      \n"
    951 
    952         "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo frames (de-interleaved, no writeback)
    953         "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo frames (de-interleaved, post-increment)
    954         "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
    955         "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
    956 
    957         "vrev64.16      q2, q2                   \n"// reverse the 4 frames of the positive side in d4 and in d5
    958 
    959         "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
    960         "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
    961 
    962         "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
    963         "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
    964 
    965         "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
    966         "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
    967         "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
    968         "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef
    969 
    970         "vadd.s32       q0, q0, q12              \n"// accumulate result
    971         "vadd.s32       q4, q4, q13              \n"// accumulate result
    972         "vadd.s32       q0, q0, q14              \n"// accumulate result
    973         "vadd.s32       q4, q4, q15              \n"// accumulate result
    974 
    975         "subs           %[count], %[count], #4   \n"// update loop counter (4 coefs consumed per side)
    976         "sub            %[sP], %[sP], #16        \n"// step sP back 4 stereo frames (16 bytes)
    977 
    978         "bne            1b                       \n"// loop until count reaches exactly 0
    979 
    980         ASSEMBLY_ACCUMULATE_STEREO
    981 
    982         : [out]     "=Uv" (out[0]),
    983           [count]   "+r" (count),
    984           [coefsP0] "+r" (coefsP),
    985           [coefsN0] "+r" (coefsN),
    986           [sP]      "+r" (sP),
    987           [sN]      "+r" (sN)
    988         : [vLR]     "r" (volumeLR)
    989         : "cc", "memory",
    990           "q0", "q1", "q2", "q3", "q4",
    991           "q8", "q9", "q10", "q11",
    992           "q12", "q13", "q14", "q15"
    993     );
    994 }
    995 
    996 template <>
    997 inline void Process<1, 8>(int32_t* const out,
    998         int count,
    999         const int32_t* coefsP,
   1000         const int32_t* coefsN,
   1001         const int32_t* coefsP1,
   1002         const int32_t* coefsN1,
   1003         const int16_t* sP,
   1004         const int16_t* sN,
   1005         uint32_t lerpP,
   1006         const int32_t* const volumeLR)
   1007 {
   1008     const int CHANNELS = 1; // template specialization does not preserve params
   1009     const int STRIDE = 8;
   1010     sP -= CHANNELS*((STRIDE>>1)-1);
   1011     asm (
   1012         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
   1013         "veor           q0, q0, q0               \n"// result, initialize to 0
   1014 
   1015         "1:                                      \n"
   1016 
   1017         "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
   1018         "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
   1019         "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
   1020         "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
   1021         "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
   1022         "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
   1023 
   1024         "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
   1025 
   1026         "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
   1027         "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
   1028         "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
   1029 
   1030         "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
   1031         "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
   1032         "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
   1033 
   1034         "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
   1035         "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
   1036 
   1037         "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
   1038         "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
   1039 
   1040         "vadd.s32       q0, q0, q12              \n"// accumulate result
   1041         "vadd.s32       q0, q0, q14              \n"// accumulate result
   1042 
   1043         "subs           %[count], %[count], #4   \n"// update loop counter
   1044         "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
   1045 
   1046         "bne            1b                       \n"// loop
   1047 
   1048         ASSEMBLY_ACCUMULATE_MONO
   1049 
   1050         : [out]     "=Uv" (out[0]),
   1051           [count]   "+r" (count),
   1052           [coefsP0] "+r" (coefsP),
   1053           [coefsP1] "+r" (coefsP1),
   1054           [coefsN0] "+r" (coefsN),
   1055           [coefsN1] "+r" (coefsN1),
   1056           [sP]      "+r" (sP),
   1057           [sN]      "+r" (sN)
   1058         : [lerpP]   "r" (lerpP),
   1059           [vLR]     "r" (volumeLR)
   1060         : "cc", "memory",
   1061           "q0", "q1", "q2", "q3",
   1062           "q8", "q9", "q10", "q11",
   1063           "q12", "q14"
   1064     );
   1065 }
   1066 
   // NEON Process<2, 8> for 32-bit coefficients: stereo FIR accumulation with
   // linear interpolation between two coefficient sets.
   //
   // Per iteration, 4 coefficients per side are interpolated before use:
   //   positive side: coefsP  + lerpP * (coefsP1 - coefsP)
   //   negative side: coefsN1 + lerpP * (coefsN  - coefsN1)
   // with the scaling done by vqrdmulh.s32 against the scalar d2[0] (lerpP).
   // Both channels share the interpolated coefficients. Samples are widened to
   // 32 bits (shifted left by 15), multiplied with vqrdmulh.s32 and added into
   // q0 and q4. ASSEMBLY_ACCUMULATE_STEREO (defined earlier in this file) saves
   // the two accumulators as stereo out; volumeLR is handed to it as %[vLR].
   //
   // Preconditions visible from the code:
   //  - count is decremented by 4 per iteration and the loop only exits on an
   //    exact zero, so it must be a positive multiple of 4.
   //  - all four coefficient pointers use a :128 alignment qualifier
   //    (16-byte aligned).
   1067 template <>
   1068 inline
   1069 void Process<2, 8>(int32_t* const out,
   1070         int count,
   1071         const int32_t* coefsP,
   1072         const int32_t* coefsN,
   1073         const int32_t* coefsP1,
   1074         const int32_t* coefsN1,
   1075         const int16_t* sP,
   1076         const int16_t* sN,
   1077         uint32_t lerpP,
   1078         const int32_t* const volumeLR)
   1079 {
   1080     const int CHANNELS = 2; // template specialization does not preserve params
   1081     const int STRIDE = 8;
            // back up 3 frames so the 4-frame load below ends at the frame sP pointed to
   1082     sP -= CHANNELS*((STRIDE>>1)-1);
   1083     asm (
   1084         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase (only lane d2[0] is written)
   1085         "veor           q0, q0, q0               \n"// result (1st channel), initialize to 0
   1086         "veor           q4, q4, q4               \n"// result (2nd channel), initialize to 0
   1087 
   1088         "1:                                      \n"
   1089         "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo frames (de-interleaved, no writeback)
   1090         "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo frames (de-interleaved, post-increment)
   1091         "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
   1092         "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
   1093         "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
   1094         "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
   1095 
   1096         "vrev64.16      q2, q2                   \n"// reverse the 4 frames of the positive side in d4 and in d5
   1097 
   1098         "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
   1099         "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
   1100         "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
   1101         "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
   1102 
            // Both coefficient sets are scaled by the same phase held in d2[0];
            // d2[1] is never loaded, so it must not be used as the multiplier.
   1103         "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
   1104         "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
   1105         "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
   1106         "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
   1107 
   1108         "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
   1109         "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set
   1110 
   1111         "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
   1112         "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
   1113         "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
   1114         "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef
   1115 
   1116         "vadd.s32       q0, q0, q12              \n"// accumulate result
   1117         "vadd.s32       q4, q4, q13              \n"// accumulate result
   1118         "vadd.s32       q0, q0, q14              \n"// accumulate result
   1119         "vadd.s32       q4, q4, q15              \n"// accumulate result
   1120 
   1121         "subs           %[count], %[count], #4   \n"// update loop counter (4 coefs consumed per side)
   1122         "sub            %[sP], %[sP], #16        \n"// step sP back 4 stereo frames (16 bytes)
   1123 
   1124         "bne            1b                       \n"// loop until count reaches exactly 0
   1125 
   1126         ASSEMBLY_ACCUMULATE_STEREO
   1127 
   1128         : [out]     "=Uv" (out[0]),
   1129           [count]   "+r" (count),
   1130           [coefsP0] "+r" (coefsP),
   1131           [coefsP1] "+r" (coefsP1),
   1132           [coefsN0] "+r" (coefsN),
   1133           [coefsN1] "+r" (coefsN1),
   1134           [sP]      "+r" (sP),
   1135           [sN]      "+r" (sN)
   1136         : [lerpP]   "r" (lerpP),
   1137           [vLR]     "r" (volumeLR)
   1138         : "cc", "memory",
   1139           "q0", "q1", "q2", "q3", "q4",
   1140           "q8", "q9", "q10", "q11",
   1141           "q12", "q13", "q14", "q15"
   1142     );
   1143 }
   1144 
   1145 #endif //USE_NEON
   1146 
   1147 }; // namespace android
   1148 
   1149 #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
   1150