Home | History | Annotate | Download | only in x86
      1 /* Copyright (c) 2014, Cisco Systems, INC
      2    Written by XiangMingZhu WeiZhou MinPeng YanWang
      3 
      4    Redistribution and use in source and binary forms, with or without
      5    modification, are permitted provided that the following conditions
      6    are met:
      7 
      8    - Redistributions of source code must retain the above copyright
      9    notice, this list of conditions and the following disclaimer.
     10 
     11    - Redistributions in binary form must reproduce the above copyright
     12    notice, this list of conditions and the following disclaimer in the
     13    documentation and/or other materials provided with the distribution.
     14 
     15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     19    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <xmmintrin.h>
     33 #include <emmintrin.h>
     34 #include <smmintrin.h>
     35 
     36 #include "SigProc_FIX.h"
     37 #include "define.h"
     38 #include "tuning_parameters.h"
     39 #include "pitch.h"
     40 #include "celt/x86/x86cpu.h"
     41 
     42 #define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384; upper bound asserted on entry to silk_burg_modified_sse4_1() */
     43 
     44 #define QA                          25              /* Q-domain of the working prediction coefficients Af_QA[] */
     45 #define N_BITS_HEAD_ROOM            2               /* Number of leading-zero headroom bits targeted when C0 is renormalized */
     46 #define MIN_RSHIFTS                 -16             /* Lower limit used when extending rshifts downwards. NOTE(review): expansion is not parenthesized; the only use in this file is "MIN_RSHIFTS - rshifts" */
     47 #define MAX_RSHIFTS                 (32 - QA)       /* Upper limit for rshifts */
     48 
     49 /* Compute reflection coefficients from input signal.
         *
         * SSE4.1 variant. The input consists of nb_subfr subframes of subfr_length
         * samples each, stored back to back in x. A predictor of order D is built
         * one coefficient per iteration of the main loop ( n = 0 .. D-1 ).
         *
         * Outline of what the code below does:
         *  1. C0 ( signal energy returned by silk_sum_sqr_shift ) is rescaled and
         *     rshifts is adjusted, with the adjustment limited by MAX_RSHIFTS and
         *     MIN_RSHIFTS. Correlation values are then kept in Q( -rshifts ).
         *  2. The autocorrelations for lags 1 .. D, summed over all subframes, seed
         *     C_first_row and C_last_row.
         *  3. For every order n the two correlation rows and the products CAf / CAb
         *     are updated ( scalar code when rshifts > -2, SSE4.1 code otherwise ),
         *     a reflection coefficient rc_Q31 is derived from them, and Af_QA, CAf
         *     and CAb are updated with it.
         *  4. If the updated inverse prediction gain would be <= minInvGain_Q30,
         *     rc_Q31 is replaced by a limited value, the remaining coefficients are
         *     set to zero and the loop is left early.
         *  5. A_Q16[ k ] receives -Af_QA[ k ] rounded to Q16, *res_nrg receives the
         *     residual energy and *res_nrg_Q is set to -rshifts.
         *
         * arch is not interpreted here; it is only passed on to
         * silk_inner_prod16_aligned_64, celt_pitch_xcorr and silk_inner_prod_aligned.
         *
         * Af_QA is deliberately not cleared up front: at order n only the entries
         * 0 .. n-1 are read, and those were written by earlier iterations.
         */
     50 void silk_burg_modified_sse4_1(
     51     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     52     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
     53     opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
     54     const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
     55     const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
     56     const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
     57     const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
     58     const opus_int              D,                  /* I    Order                                                       */
     59     int                         arch                /* I    Run-time architecture                                       */
     60 )
     61 {
     62     opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
     63     opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
     64     const opus_int16 *x_ptr;
     65     opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
     66     opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
     67     opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
     68     opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
     69     opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
     70     opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
     71 
            /* Vector temporaries; each holds four 32-bit lanes */
     72     __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
            /* Rounding constant for the vector version of the QA -> Q17 conversion of Af_QA */
     73     __m128i CONST1 = _mm_set1_epi32(1);
     74 
     75     silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
     76 
     77     /* Compute autocorrelations, added over subframes */
     78     silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
     79     if( rshifts > MAX_RSHIFTS ) {
                /* More right shifts were applied than allowed here: scale C0 back up and clamp rshifts */
     80         C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
     81         silk_assert( C0 > 0 );
     82         rshifts = MAX_RSHIFTS;
     83     } else {
                /* Aim for N_BITS_HEAD_ROOM leading zero bits ( sign bit not counted ) in C0; */
                /* the extra shift may be negative, i.e. a left shift, and is limited so that */
                /* rshifts + rshifts_extra stays within MIN_RSHIFTS .. MAX_RSHIFTS            */
     84         lz = silk_CLZ32( C0 ) - 1;
     85         rshifts_extra = N_BITS_HEAD_ROOM - lz;
     86         if( rshifts_extra > 0 ) {
     87             rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
     88             C0 = silk_RSHIFT32( C0, rshifts_extra );
     89         } else {
     90             rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
     91             C0 = silk_LSHIFT32( C0, -rshifts_extra );
     92         }
     93         rshifts += rshifts_extra;
     94     }
            /* C0 plus a small fraction of C0 ( FIND_LPC_COND_FAC ) plus 1. The identical assignment is */
            /* repeated under "Initialize" below; neither array is modified in between.                 */
     95     CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
     96     silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
     97     if( rshifts > 0 ) {
                /* Right-shift case: 64-bit inner product per lag, shifted down before accumulation */
     98         for( s = 0; s < nb_subfr; s++ ) {
     99             x_ptr = x + s * subfr_length;
    100             for( n = 1; n < D + 1; n++ ) {
    101                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
    102                     silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
    103             }
    104         }
    105     } else {
                /* No right shift: 32-bit correlations, scaled up by -rshifts afterwards */
    106         for( s = 0; s < nb_subfr; s++ ) {
    107             int i;
    108             opus_int32 d;
    109             x_ptr = x + s * subfr_length;
                    /* Presumably fills xcorr[ 0 .. D-1 ] with the lag 1 .. D correlations over the first */
                    /* subfr_length - D products ( semantics of celt_pitch_xcorr are not visible here )  */
    110             celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
                    /* Add the products x_ptr[ i ] * x_ptr[ i - n ] for i = n + subfr_length - D .. subfr_length - 1 */
    111             for( n = 1; n < D + 1; n++ ) {
    112                for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
    113                   d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
    114                xcorr[ n - 1 ] += d;
    115             }
    116             for( n = 1; n < D + 1; n++ ) {
    117                 C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
    118             }
    119         }
    120     }
            /* Both rows start out from the same autocorrelation values */
    121     silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
    122 
    123     /* Initialize */
    124     CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
    125 
    126     invGain_Q30 = (opus_int32)1 << 30;
    127     reached_max_gain = 0;
    128     for( n = 0; n < D; n++ ) {
    129         /* Update first row of correlation matrix (without first element) */
    130         /* Update last row of correlation matrix (without last element, stored in reversed order) */
    131         /* Update C * Af */
    132         /* Update C * flipud(Af) (stored in reversed order) */
    133         if( rshifts > -2 ) {
                    /* Scalar path, built on 32x16-bit multiply-accumulates ( silk_SMLAWB ) */
    134             for( s = 0; s < nb_subfr; s++ ) {
    135                 x_ptr = x + s * subfr_length;
    136                 x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    16 - rshifts );        /* Q(16-rshifts) */
    137                 x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts );        /* Q(16-rshifts) */
    138                 tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    QA - 16 );             /* Q(QA-16) */
    139                 tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 );             /* Q(QA-16) */
    140                 for( k = 0; k < n; k++ ) {
    141                     C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
    142                     C_last_row[ k ]  = silk_SMLAWB( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
    143                     Atmp_QA = Af_QA[ k ];
    144                     tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ]            );                 /* Q(QA-16) */
    145                     tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] );                 /* Q(QA-16) */
    146                 }
    147                 tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
    148                 tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
    149                 for( k = 0; k <= n; k++ ) {
    150                     CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ]                    );        /* Q( -rshift ) */
    151                     CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] );        /* Q( -rshift ) */
    152                 }
    153             }
    154         } else {
                    /* rshifts <= -2: SSE4.1 path, four values of k per vector iteration, scalar loops for the remainder */
    155             for( s = 0; s < nb_subfr; s++ ) {
    156                 x_ptr = x + s * subfr_length;
    157                 x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    -rshifts );            /* Q( -rshifts ) */
    158                 x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts );            /* Q( -rshifts ) */
    159                 tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    17 );                  /* Q17 */
    160                 tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 );                  /* Q17 */
    161 
                        /* Broadcast x1 / x2 to all lanes; TMP1 / TMP2 collect per-lane partial sums for tmp1 / tmp2 */
    162                 X1_3210 = _mm_set1_epi32( x1 );
    163                 X2_3210 = _mm_set1_epi32( x2 );
    164                 TMP1_3210 = _mm_setzero_si128();
    165                 TMP2_3210 = _mm_setzero_si128();
    166                 for( k = 0; k < n - 3; k += 4 ) {
                            /* Load four 16-bit samples each ( presumably sign-extended to 32 bits by the macro ). */
                            /* PTR is read from ascending addresses x_ptr[ n-k-4 .. n-k-1 ] and its lanes are      */
                            /* reversed below so that lane j holds x_ptr[ n - ( k + j ) - 1 ].                     */
    167                     PTR_3210   = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
    168                     SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
    169                     FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
    170                     PTR_3210   = _mm_shuffle_epi32( PTR_3210,  _MM_SHUFFLE( 0, 1, 2, 3 ) );
    171                     LAST_3210  = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
    172                     ATMP_3210  = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );
    173 
                            /* 32-bit low products, counterpart of the silk_MLA calls in the scalar remainder loop */
    174                     T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
    175                     T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );
    176 
                            /* ( ( a >> 7 ) + 1 ) >> 1: rounded right shift by 8 = QA - 17, counterpart of the */
                            /* silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ) in the scalar remainder loop           */
    177                     ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
    178                     ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
    179                     ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );
    180 
    181                     FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
    182                     LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );
    183 
    184                     PTR_3210   = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
    185                     SUBFR_3210   = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );
    186 
    187                     _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
    188                     _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );
    189 
    190                     TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
    191                     TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
    192                 }
    193 
                        /* Horizontal sum: add the upper 64 bits onto the lower 64 bits, then lane 1 onto lane 0; */
                        /* lane 0 then holds the sum of all four lanes                                            */
    194                 TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
    195                 TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
    196                 TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
    197                 TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );
    198 
    199                 tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
    200                 tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
    201 
                        /* Scalar remainder: the values of k not covered by the vector loop above */
    202                 for( ; k < n; k++ ) {
    203                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
    204                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
    205                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
    206                     tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
    207                     tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
    208                 }
    209 
    210                 tmp1 = -tmp1;                /* Q17 */
    211                 tmp2 = -tmp2;                /* Q17 */
    212 
    213                 {
    214                     __m128i xmm_tmp1, xmm_tmp2;
    215                     __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
    216                     __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;
    217 
    218                     xmm_tmp1 = _mm_set1_epi32( tmp1 );
    219                     xmm_tmp2 = _mm_set1_epi32( tmp2 );
    220 
                            /* Vector counterpart of the scalar remainder loop below: for four consecutive k, */
                            /* CAf[ k ] / CAb[ k ] += ( tmp * ( sample << ( -rshifts - 1 ) ) ) >> 16,          */
                            /* using 64-bit products                                                          */
    221                     for( k = 0; k <= n - 3; k += 4 ) {
    222                         xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
    223                         xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );
    224 
                                /* Reverse lane order so that lane j holds x_ptr[ n - ( k + j ) ] */
    225                         xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );
    226 
    227                         xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
    228                         xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
    229 
    230                         /* Rotate the lanes down by one ( lane 0 <- 1, 1 <- 2, 2 <- 3, 3 <- 0 ) so that the odd lanes land in the even positions read by _mm_mul_epi32; for those positions this equals _mm_srli_si128( xmm_x_ptr_n_k_x2x0, 4 ) */
    231                         xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    232                         xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    233 
                                /* Signed 32x32 -> 64-bit products of lanes 0 and 2 of each operand */
    234                         xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
    235                         xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
    236                         xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
    237                         xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );
    238 
                                /* Bring bits 16 .. 47 of every product into place: into the low dword for the */
                                /* even-lane products and into the high dword for the odd-lane products        */
    239                         xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
    240                         xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
    241                         xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
    242                         xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );
    243 
                                /* Mask 0xCC takes dwords 1 and 3 from the second operand, giving four 32-bit results in lane order */
    244                         xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
    245                         xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
    246 
                                /* X1_3210 / PTR_3210 are reused here as plain temporaries for CAf / CAb */
    247                         X1_3210  = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
    248                         PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );
    249 
    250                         X1_3210  = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
    251                         PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );
    252 
    253                         _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
    254                         _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
    255                     }
    256 
                            /* Scalar remainder up to and including k = n */
    257                     for( ; k <= n; k++ ) {
    258                         CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
    259                             silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) );                    /* Q( -rshift ) */
    260                         CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
    261                             silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
    262                     }
    263                 }
    264             }
    265         }
    266 
    267         /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
    268         tmp1 = C_first_row[ n ];                                                                        /* Q( -rshifts ) */
    269         tmp2 = C_last_row[ n ];                                                                         /* Q( -rshifts ) */
    270         num  = 0;                                                                                       /* Q( -rshifts ) */
    271         nrg  = silk_ADD32( CAb[ 0 ], CAf[ 0 ] );                                                        /* Q( 1-rshifts ) */
    272         for( k = 0; k < n; k++ ) {
                    /* Shift the coefficient up by at most 32 - QA bits before the high-word multiply; */
                    /* the shift by 32 - QA - lz after the multiply makes up for the remainder         */
    273             Atmp_QA = Af_QA[ k ];
    274             lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
    275             lz = silk_min( 32 - QA, lz );
    276             Atmp1 = silk_LSHIFT32( Atmp_QA, lz );                                                       /* Q( QA + lz ) */
    277 
    278             tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[  n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
    279             tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
    280             num  = silk_ADD_LSHIFT32( num,  silk_SMMUL( CAb[ n - k ],             Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
    281             nrg  = silk_ADD_LSHIFT32( nrg,  silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
    282                                                                                 Atmp1 ), 32 - QA - lz );    /* Q( 1-rshifts ) */
    283         }
    284         CAf[ n + 1 ] = tmp1;                                                                            /* Q( -rshifts ) */
    285         CAb[ n + 1 ] = tmp2;                                                                            /* Q( -rshifts ) */
    286         num = silk_ADD32( num, tmp2 );                                                                  /* Q( -rshifts ) */
    287         num = silk_LSHIFT32( -num, 1 );                                                                 /* Q( 1-rshifts ) */
    288 
    289         /* Calculate the next order reflection (parcor) coefficient */
    290         if( silk_abs( num ) < nrg ) {
    291             rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
    292         } else {
                    /* | num | >= nrg: saturate instead of dividing */
    293             rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
    294         }
    295 
    296         /* Update inverse prediction gain */
    297         tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
    298         tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
    299         if( tmp1 <= minInvGain_Q30 ) {
    300             /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
    301             tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 );            /* Q30 */
    302             rc_Q31 = silk_SQRT_APPROX( tmp2 );                                                  /* Q15 */
    303             if( rc_Q31 > 0 ) {
    304                  /* Newton-Raphson iteration */
    305                 rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 );                   /* Q15 */
    306                 rc_Q31 = silk_LSHIFT32( rc_Q31, 16 );                                               /* Q31 */
    307                 if( num < 0 ) {
    308                     /* Ensure adjusted reflection coefficient has the original sign */
    309                     rc_Q31 = -rc_Q31;
    310                 }
    311             }
    312             invGain_Q30 = minInvGain_Q30;
    313             reached_max_gain = 1;
    314         } else {
    315             invGain_Q30 = tmp1;
    316         }
    317 
    318         /* Update the AR coefficients */
                /* Entries k and n - k - 1 are updated as a pair, each from the old value of the other */
    319         for( k = 0; k < (n + 1) >> 1; k++ ) {
    320             tmp1 = Af_QA[ k ];                                                                  /* QA */
    321             tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
    322             Af_QA[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );      /* QA */
    323             Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );      /* QA */
    324         }
    325         Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA );                                          /* QA */
    326 
    327         if( reached_max_gain ) {
    328             /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
    329             for( k = n + 1; k < D; k++ ) {
    330                 Af_QA[ k ] = 0;
    331             }
    332             break;
    333         }
    334 
    335         /* Update C * Af and C * Ab */
    336         for( k = 0; k <= n + 1; k++ ) {
    337             tmp1 = CAf[ k ];                                                                    /* Q( -rshifts ) */
    338             tmp2 = CAb[ n - k + 1 ];                                                            /* Q( -rshifts ) */
    339             CAf[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );        /* Q( -rshifts ) */
    340             CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );        /* Q( -rshifts ) */
    341         }
    342     }
    343 
    344     if( reached_max_gain ) {
                /* Gain-limited exit: the residual energy is approximated as invGain_Q30 * C0 */
    345         for( k = 0; k < D; k++ ) {
    346             /* Scale coefficients */
    347             A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
    348         }
    349         /* Subtract energy of preceding samples from C0 */
    350         if( rshifts > 0 ) {
    351             for( s = 0; s < nb_subfr; s++ ) {
    352                 x_ptr = x + s * subfr_length;
    353                 C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
    354             }
    355         } else {
    356             for( s = 0; s < nb_subfr; s++ ) {
    357                 x_ptr = x + s * subfr_length;
    358                 C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
    359             }
    360         }
    361         /* Approximate residual energy */
    362         *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
    363         *res_nrg_Q = -rshifts;
    364     } else {
    365         /* Return residual energy */
    366         nrg  = CAf[ 0 ];                                                                            /* Q( -rshifts ) */
    367         tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
    368         for( k = 0; k < D; k++ ) {
    369             Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );                                       /* Q16 */
    370             nrg  = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 );                                         /* Q( -rshifts ) */
    371             tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 );                                               /* Q16 */
    372             A_Q16[ k ] = -Atmp1;
    373         }
                /* tmp1 is now 1 + sum( Atmp1^2 ) in Q16; the FIND_LPC_COND_FAC fraction of C0 scaled by it is subtracted from nrg */
    374         *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
    375         *res_nrg_Q = -rshifts;
    376     }
    377 }
    378