Home | History | Annotate | Download | only in x86
      1 /* Copyright (c) 2014, Cisco Systems, INC
      2    Written by XiangMingZhu WeiZhou MinPeng YanWang
      3 
      4    Redistribution and use in source and binary forms, with or without
      5    modification, are permitted provided that the following conditions
      6    are met:
      7 
      8    - Redistributions of source code must retain the above copyright
      9    notice, this list of conditions and the following disclaimer.
     10 
     11    - Redistributions in binary form must reproduce the above copyright
     12    notice, this list of conditions and the following disclaimer in the
     13    documentation and/or other materials provided with the distribution.
     14 
     15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     19    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <xmmintrin.h>
     33 #include <emmintrin.h>
     34 #include <smmintrin.h>
     35 #include "main.h"
     36 #include "celt/x86/x86cpu.h"
     37 
     38 #include "stack_alloc.h"
     39 
     40 typedef struct {
     41     opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
     42     opus_int32 RandState[ DECISION_DELAY ];
     43     opus_int32 Q_Q10[     DECISION_DELAY ];
     44     opus_int32 Xq_Q14[    DECISION_DELAY ];
     45     opus_int32 Pred_Q15[  DECISION_DELAY ];
     46     opus_int32 Shape_Q14[ DECISION_DELAY ];
     47     opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
     48     opus_int32 LF_AR_Q14;
     49     opus_int32 Seed;
     50     opus_int32 SeedInit;
     51     opus_int32 RD_Q10;
     52 } NSQ_del_dec_struct;
     53 
     54 typedef struct {
     55     opus_int32 Q_Q10;
     56     opus_int32 RD_Q10;
     57     opus_int32 xq_Q14;
     58     opus_int32 LF_AR_Q14;
     59     opus_int32 sLTP_shp_Q14;
     60     opus_int32 LPC_exc_Q14;
     61 } NSQ_sample_struct;
     62 
     63 typedef NSQ_sample_struct  NSQ_sample_pair[ 2 ];
     64 
     65 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
     66     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     67     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     68     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
     69     const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
     70     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     71     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     72     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
     73     opus_int            subfr,                      /* I    Subframe number                     */
     74     opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
     75     const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
     76     const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
     77     const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
     78     const opus_int      signal_type,                /* I    Signal type                         */
     79     const opus_int      decisionDelay               /* I    Decision delay                      */
     80 );
     81 
     82 /******************************************/
     83 /* Noise shape quantizer for one subframe */
     84 /******************************************/
     85 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
     86     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
     87     NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
     88     opus_int            signalType,             /* I    Signal type                         */
     89     const opus_int32    x_Q10[],                /* I                                        */
     90     opus_int8           pulses[],               /* O                                        */
     91     opus_int16          xq[],                   /* O                                        */
     92     opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
     93     opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
     94     const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
     95     const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
     96     const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
     97     opus_int            lag,                    /* I    Pitch lag                           */
     98     opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
     99     opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    100     opus_int32          LF_shp_Q14,             /* I                                        */
    101     opus_int32          Gain_Q16,               /* I                                        */
    102     opus_int            Lambda_Q10,             /* I                                        */
    103     opus_int            offset_Q10,             /* I                                        */
    104     opus_int            length,                 /* I    Input length                        */
    105     opus_int            subfr,                  /* I    Subframe number                     */
    106     opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    107     opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    108     opus_int            warping_Q16,            /* I                                        */
    109     opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    110     opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    111     opus_int            decisionDelay           /* I                                        */
    112 );
    113 
    114 void silk_NSQ_del_dec_sse4_1(
    115     const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    116     silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    117     SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    118     const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    119     opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    120     const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    121     const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    122     const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    123     const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    124     const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    125     const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    126     const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    127     const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    128     const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    129     const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
    130 )
    131 {
    132     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
    133     opus_int            last_smple_idx, smpl_buf_idx, decisionDelay;
    134     const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
    135     opus_int16          *pxq;
    136     VARDECL( opus_int32, sLTP_Q15 );
    137     VARDECL( opus_int16, sLTP );
    138     opus_int32          HarmShapeFIRPacked_Q14;
    139     opus_int            offset_Q10;
    140     opus_int32          RDmin_Q10, Gain_Q10;
    141     VARDECL( opus_int32, x_sc_Q10 );
    142     VARDECL( opus_int32, delayedGain_Q10 );
    143     VARDECL( NSQ_del_dec_struct, psDelDec );
    144     NSQ_del_dec_struct  *psDD;
    145     SAVE_STACK;
    146 
    147     /* Set unvoiced lag to the previous one, overwrite later for voiced */
    148     lag = NSQ->lagPrev;
    149 
    150     silk_assert( NSQ->prev_gain_Q16 != 0 );
    151 
    152     /* Initialize delayed decision states */
    153     ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
    154     silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
    155     for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
    156         psDD                 = &psDelDec[ k ];
    157         psDD->Seed           = ( k + psIndices->Seed ) & 3;
    158         psDD->SeedInit       = psDD->Seed;
    159         psDD->RD_Q10         = 0;
    160         psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
    161         psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
    162         silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    163         silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
    164     }
    165 
    166     offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
    167     smpl_buf_idx = 0; /* index of oldest samples */
    168 
    169     decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
    170 
    171     /* For voiced frames limit the decision delay to lower than the pitch lag */
    172     if( psIndices->signalType == TYPE_VOICED ) {
    173         for( k = 0; k < psEncC->nb_subfr; k++ ) {
    174             decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
    175         }
    176     } else {
    177         if( lag > 0 ) {
    178             decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
    179         }
    180     }
    181 
    182     if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
    183         LSF_interpolation_flag = 0;
    184     } else {
    185         LSF_interpolation_flag = 1;
    186     }
    187 
    188     ALLOC( sLTP_Q15,
    189            psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    190     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    191     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    192     ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
    193     /* Set up pointers to start of sub frame */
    194     pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
    195     NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    196     NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
    197     subfr = 0;
    198     for( k = 0; k < psEncC->nb_subfr; k++ ) {
    199         A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
    200         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
    201         AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
    202 
    203         /* Noise shape parameters */
    204         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
    205         HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
    206         HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
    207 
    208         NSQ->rewhite_flag = 0;
    209         if( psIndices->signalType == TYPE_VOICED ) {
    210             /* Voiced */
    211             lag = pitchL[ k ];
    212 
    213             /* Re-whitening */
    214             if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
    215                 if( k == 2 ) {
    216                     /* RESET DELAYED DECISIONS */
    217                     /* Find winner */
    218                     RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
    219                     Winner_ind = 0;
    220                     for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
    221                         if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
    222                             RDmin_Q10 = psDelDec[ i ].RD_Q10;
    223                             Winner_ind = i;
    224                         }
    225                     }
    226                     for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
    227                         if( i != Winner_ind ) {
    228                             psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
    229                             silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
    230                         }
    231                     }
    232 
    233                     /* Copy final part of signals from winner state to output and long-term filter states */
    234                     psDD = &psDelDec[ Winner_ind ];
    235                     last_smple_idx = smpl_buf_idx + decisionDelay;
    236                     for( i = 0; i < decisionDelay; i++ ) {
    237                         last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
    238                         if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
    239                         pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
    240                         pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
    241                             silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
    242                         NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
    243                     }
    244 
    245                     subfr = 0;
    246                 }
    247 
    248                 /* Rewhiten with new A coefs */
    249                 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
    250                 silk_assert( start_idx > 0 );
    251 
    252                 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
    253                     A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
    254 
    255                 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
    256                 NSQ->rewhite_flag = 1;
    257             }
    258         }
    259 
    260         silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
    261             psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
    262 
    263         silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
    264             delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
    265             Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
    266             psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
    267 
    268         x_Q3   += psEncC->subfr_length;
    269         pulses += psEncC->subfr_length;
    270         pxq    += psEncC->subfr_length;
    271     }
    272 
    273     /* Find winner */
    274     RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
    275     Winner_ind = 0;
    276     for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
    277         if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
    278             RDmin_Q10 = psDelDec[ k ].RD_Q10;
    279             Winner_ind = k;
    280         }
    281     }
    282 
    283     /* Copy final part of signals from winner state to output and long-term filter states */
    284     psDD = &psDelDec[ Winner_ind ];
    285     psIndices->Seed = psDD->SeedInit;
    286     last_smple_idx = smpl_buf_idx + decisionDelay;
    287     Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
    288     for( i = 0; i < decisionDelay; i++ ) {
    289         last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
    290         if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
    291         pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
    292         pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
    293             silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
    294         NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
    295     }
    296     silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    297     silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
    298 
    299     /* Update states */
    300     NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
    301     NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
    302 
    303     /* Save quantized speech signal */
    304     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    305     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
    306     RESTORE_STACK;
    307 }
    308 
    309 /******************************************/
    310 /* Noise shape quantizer for one subframe */
    311 /******************************************/
    312 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
    313     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    314     NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
    315     opus_int            signalType,             /* I    Signal type                         */
    316     const opus_int32    x_Q10[],                /* I                                        */
    317     opus_int8           pulses[],               /* O                                        */
    318     opus_int16          xq[],                   /* O                                        */
    319     opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    320     opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    321     const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    322     const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    323     const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    324     opus_int            lag,                    /* I    Pitch lag                           */
    325     opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    326     opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    327     opus_int32          LF_shp_Q14,             /* I                                        */
    328     opus_int32          Gain_Q16,               /* I                                        */
    329     opus_int            Lambda_Q10,             /* I                                        */
    330     opus_int            offset_Q10,             /* I                                        */
    331     opus_int            length,                 /* I    Input length                        */
    332     opus_int            subfr,                  /* I    Subframe number                     */
    333     opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    334     opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    335     opus_int            warping_Q16,            /* I                                        */
    336     opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    337     opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    338     opus_int            decisionDelay           /* I                                        */
    339 )
    340 {
    341     opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    342     opus_int32   Winner_rand_state;
    343     opus_int32   LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
    344     opus_int32   n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
    345     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
    346     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
    347     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
    348     VARDECL( NSQ_sample_pair, psSampleState );
    349     NSQ_del_dec_struct *psDD;
    350     NSQ_sample_struct  *psSS;
    351 
    352     __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
    353     __m128i b_Q12_0123, b_sr_Q12_0123;
    354     SAVE_STACK;
    355 
    356     silk_assert( nStatesDelayedDecision > 0 );
    357     ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
    358 
    359     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    360     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    361     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
    362 
    363     a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
    364     a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
    365 
    366     if( opus_likely( predictLPCOrder == 16 ) ) {
    367         a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
    368         a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
    369     }
    370 
    371     if( signalType == TYPE_VOICED ){
    372         b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
    373         b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    374     }
    375     for( i = 0; i < length; i++ ) {
    376         /* Perform common calculations used in all states */
    377 
    378         /* Long-term prediction */
    379         if( signalType == TYPE_VOICED ) {
    380             /* Unrolled loop */
    381             /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    382             LTP_pred_Q14 = 2;
    383             {
    384                 __m128i tmpa, tmpb, pred_lag_ptr_tmp;
    385                 pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
    386                 pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
    387                 tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
    388                 tmpa                = _mm_srli_si128( tmpa, 2 );
    389 
    390                 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
    391                 pred_lag_ptr_tmp    = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
    392                 pred_lag_ptr_tmp    = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
    393                 pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
    394 
    395                 tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
    396                 pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
    397                 LTP_pred_Q14        += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
    398 
    399                 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
    400                 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
    401                 pred_lag_ptr++;
    402             }
    403         } else {
    404             LTP_pred_Q14 = 0;
    405         }
    406 
    407         /* Long-term shaping */
    408         if( lag > 0 ) {
    409             /* Symmetric, packed FIR coefficients */
    410             n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
    411             n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
    412             n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
    413             shp_lag_ptr++;
    414         } else {
    415             n_LTP_Q14 = 0;
    416         }
    417         {
    418             __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
    419 
    420             for( k = 0; k < nStatesDelayedDecision; k++ ) {
    421                 /* Delayed decision state */
    422                 psDD = &psDelDec[ k ];
    423 
    424                 /* Sample state */
    425                 psSS = psSampleState[ k ];
    426 
    427                 /* Generate dither */
    428                 psDD->Seed = silk_RAND( psDD->Seed );
    429 
    430                 /* Pointer used in short term prediction and shaping */
    431                 psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
    432                 /* Short-term prediction */
    433                 silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
    434                 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    435                 LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
    436 
    437                 tmpb = _mm_setzero_si128();
    438 
    439                 /* step 1 */
    440                 psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
    441                 psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );      /* 0, -1, -2, -3 */
    442                 tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );    /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
    443 
    444                 tmpa            = _mm_srli_epi64( tmpa, 16 );
    445                 tmpb            = _mm_add_epi32( tmpb, tmpa );
    446 
    447                 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    448                 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    449                 psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
    450                 psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    451                 tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    452 
    453                 /* step 2 */
    454                 psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
    455                 psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
    456                 tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
    457                 tmpa            = _mm_srli_epi64( tmpa, 16 );
    458                 tmpb            = _mm_add_epi32( tmpb, tmpa );
    459 
    460                 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    461                 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    462                 psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
    463                 psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    464                 tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    465 
    466                 if ( opus_likely( predictLPCOrder == 16 ) )
    467                 {
    468                     /* step 3 */
    469                     psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
    470                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
    471                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
    472                     tmpa            = _mm_srli_epi64( tmpa, 16 );
    473                     tmpb            = _mm_add_epi32( tmpb, tmpa );
    474 
    475                     psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    476                     a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
    477                     psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
    478                     psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    479                     tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    480 
    481                     /* step 4 */
    482                     psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
    483                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
    484                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
    485                     tmpa            = _mm_srli_epi64( tmpa, 16 );
    486                     tmpb            = _mm_add_epi32( tmpb, tmpa );
    487 
    488                     psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    489                     a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    490                     psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
    491                     psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    492                     tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    493 
    494                     /* add at last */
    495                     /* equal shift right 8 bytes*/
    496                     tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    497                     tmpb            = _mm_add_epi32( tmpb, tmpa );
    498                     LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
    499                 }
    500                 else
    501                 {
    502                     /* add at last */
    503                     tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
    504                     tmpb            = _mm_add_epi32( tmpb, tmpa );
    505                     LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
    506 
    507                     LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
    508                     LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
    509                 }
    510 
    511                 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
    512 
    513                 /* Noise shape feedback */
    514                 silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
    515                 /* Output of lowpass section */
    516                 tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
    517                 /* Output of allpass section */
    518                 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
    519                 psDD->sAR2_Q14[ 0 ] = tmp2;
    520                 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
    521                 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
    522                 /* Loop over allpass sections */
    523                 for( j = 2; j < shapingLPCOrder; j += 2 ) {
    524                     /* Output of allpass section */
    525                     tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
    526                     psDD->sAR2_Q14[ j - 1 ] = tmp1;
    527                     n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
    528                     /* Output of allpass section */
    529                     tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
    530                     psDD->sAR2_Q14[ j + 0 ] = tmp2;
    531                     n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
    532                 }
    533                 psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
    534                 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
    535 
    536                 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );                                      /* Q11 -> Q12 */
    537                 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );              /* Q12 */
    538                 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );                                      /* Q12 -> Q14 */
    539 
    540                 n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 );     /* Q12 */
    541                 n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );            /* Q12 */
    542                 n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );                                      /* Q12 -> Q14 */
    543 
    544                 /* Input minus prediction plus noise feedback                       */
    545                 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
    546                 tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
    547                 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
    548                 tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
    549                 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
    550 
    551                 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
    552 
    553                 /* Flip sign depending on dither */
    554                 if ( psDD->Seed < 0 ) {
    555                     r_Q10 = -r_Q10;
    556                 }
    557                 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
    558 
    559                 /* Find two quantization level candidates and measure their rate-distortion */
    560                 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
    561                 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
    562                 if( q1_Q0 > 0 ) {
    563                     q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
    564                     q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
    565                     q2_Q10  = silk_ADD32( q1_Q10, 1024 );
    566                     rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
    567                     rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
    568                 } else if( q1_Q0 == 0 ) {
    569                     q1_Q10  = offset_Q10;
    570                     q2_Q10  = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
    571                     rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
    572                     rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
    573                 } else if( q1_Q0 == -1 ) {
    574                     q2_Q10  = offset_Q10;
    575                     q1_Q10  = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
    576                     rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
    577                     rd2_Q10 = silk_SMULBB(  q2_Q10, Lambda_Q10 );
    578                 } else {            /* q1_Q0 < -1 */
    579                     q1_Q10  = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
    580                     q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
    581                     q2_Q10  = silk_ADD32( q1_Q10, 1024 );
    582                     rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
    583                     rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
    584                 }
    585                 rr_Q10  = silk_SUB32( r_Q10, q1_Q10 );
    586                 rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
    587                 rr_Q10  = silk_SUB32( r_Q10, q2_Q10 );
    588                 rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
    589 
    590                 if( rd1_Q10 < rd2_Q10 ) {
    591                     psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
    592                     psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
    593                     psSS[ 0 ].Q_Q10  = q1_Q10;
    594                     psSS[ 1 ].Q_Q10  = q2_Q10;
    595                 } else {
    596                     psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
    597                     psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
    598                     psSS[ 0 ].Q_Q10  = q2_Q10;
    599                     psSS[ 1 ].Q_Q10  = q1_Q10;
    600                 }
    601 
    602                 /* Update states for best quantization */
    603 
    604                 /* Quantized excitation */
    605                 exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
    606                 if ( psDD->Seed < 0 ) {
    607                     exc_Q14 = -exc_Q14;
    608                 }
    609 
    610                 /* Add predictions */
    611                 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
    612                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
    613 
    614                 /* Update states */
    615                 sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
    616                 psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
    617                 psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
    618                 psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
    619                 psSS[ 0 ].xq_Q14       = xq_Q14;
    620 
    621                 /* Update states for second best quantization */
    622 
    623                 /* Quantized excitation */
    624                 exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
    625                 if ( psDD->Seed < 0 ) {
    626                     exc_Q14 = -exc_Q14;
    627                 }
    628 
    629 
    630                 /* Add predictions */
    631                 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
    632                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
    633 
    634                 /* Update states */
    635                 sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
    636                 psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
    637                 psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
    638                 psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
    639                 psSS[ 1 ].xq_Q14       = xq_Q14;
    640             }
    641         }
    642         *smpl_buf_idx  = ( *smpl_buf_idx - 1 ) % DECISION_DELAY;
    643         if( *smpl_buf_idx < 0 ) *smpl_buf_idx += DECISION_DELAY;
    644         last_smple_idx = ( *smpl_buf_idx + decisionDelay ) % DECISION_DELAY;
    645 
    646         /* Find winner */
    647         RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
    648         Winner_ind = 0;
    649         for( k = 1; k < nStatesDelayedDecision; k++ ) {
    650             if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
    651                 RDmin_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
    652                 Winner_ind = k;
    653             }
    654         }
    655 
    656         /* Increase RD values of expired states */
    657         Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
    658         for( k = 0; k < nStatesDelayedDecision; k++ ) {
    659             if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
    660                 psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
    661                 psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
    662                 silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
    663             }
    664         }
    665 
    666         /* Find worst in first set and best in second set */
    667         RDmax_Q10  = psSampleState[ 0 ][ 0 ].RD_Q10;
    668         RDmin_Q10  = psSampleState[ 0 ][ 1 ].RD_Q10;
    669         RDmax_ind = 0;
    670         RDmin_ind = 0;
    671         for( k = 1; k < nStatesDelayedDecision; k++ ) {
    672             /* find worst in first set */
    673             if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
    674                 RDmax_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
    675                 RDmax_ind = k;
    676             }
    677             /* find best in second set */
    678             if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
    679                 RDmin_Q10  = psSampleState[ k ][ 1 ].RD_Q10;
    680                 RDmin_ind = k;
    681             }
    682         }
    683 
    684         /* Replace a state if best from second set outperforms worst in first set */
    685         if( RDmin_Q10 < RDmax_Q10 ) {
    686             silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
    687                          ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
    688             silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
    689         }
    690 
    691         /* Write samples from winner to output and long-term filter states */
    692         psDD = &psDelDec[ Winner_ind ];
    693         if( subfr > 0 || i >= decisionDelay ) {
    694             pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
    695             xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
    696                 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
    697             NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
    698             sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDD->Pred_Q15[  last_smple_idx ];
    699         }
    700         NSQ->sLTP_shp_buf_idx++;
    701         NSQ->sLTP_buf_idx++;
    702 
    703         /* Update states */
    704         for( k = 0; k < nStatesDelayedDecision; k++ ) {
    705             psDD                                     = &psDelDec[ k ];
    706             psSS                                     = &psSampleState[ k ][ 0 ];
    707             psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
    708             psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
    709             psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
    710             psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
    711             psDD->Pred_Q15[  *smpl_buf_idx ]         = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
    712             psDD->Shape_Q14[ *smpl_buf_idx ]         = psSS->sLTP_shp_Q14;
    713             psDD->Seed                               = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
    714             psDD->RandState[ *smpl_buf_idx ]         = psDD->Seed;
    715             psDD->RD_Q10                             = psSS->RD_Q10;
    716         }
    717         delayedGain_Q10[     *smpl_buf_idx ]         = Gain_Q10;
    718     }
    719     /* Update LPC states */
    720     for( k = 0; k < nStatesDelayedDecision; k++ ) {
    721         psDD = &psDelDec[ k ];
    722         silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    723     }
    724     RESTORE_STACK;
    725 }
    726 
    727 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
    728     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    729     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    730     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
    731     const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
    732     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    733     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    734     opus_int32          sLTP_Q15[],                 /* I/O  LTP state matching scaled input     */
    735     opus_int            subfr,                      /* I    Subframe number                     */
    736     opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
    737     const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    738     const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I    Gains, indexed by subfr             */
    739     const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag, indexed by subfr         */
    740     const opus_int      signal_type,                /* I    Signal type                         */
    741     const opus_int      decisionDelay               /* I    Decision delay                      */
    742 )
    743 {
            /* Scales the quantizer input and the filter states for subframe `subfr`:      */
            /*  - fills x_sc_Q10[ 0 .. subfr_length-1 ] with x_Q3[] multiplied by the      */
            /*    inverse gain (four samples per iteration, scalar loop for the remainder); */
            /*  - stores Gains_Q16[ subfr ] in NSQ->prev_gain_Q16;                          */
            /*  - if NSQ->rewhite_flag is set, refills sLTP_Q15[] from sLTP[];              */
            /*  - if gain_adj_Q16 is not exactly 1.0 (1 << 16), rescales                    */
            /*    NSQ->sLTP_shp_Q14[], optionally sLTP_Q15[], and the state buffers of      */
            /*    each of the nStatesDelayedDecision entries of psDelDec[].                 */
    744     opus_int            i, k, lag;
    745     opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
    746     NSQ_del_dec_struct  *psDD;
            /* The x2x0 / x3x1 suffixes name which source elements sit in the low 32 bits  */
            /* of the two 64-bit lanes, i.e. the operands that _mm_mul_epi32 multiplies.   */
    747     __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
    748
    749     lag          = pitchL[ subfr ];
            /* The gain is clamped to at least 1 before inversion (presumably to avoid     */
            /* inverting zero -- confirm against silk_INVERSE32_varQ).                     */
    750     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    751
    752     silk_assert( inv_gain_Q31 != 0 );
    753
    754     /* Calculate gain adjustment factor: previous gain / current gain, or exactly 1.0 in Q16 when unchanged */
    755     if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
    756         gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
    757     } else {
    758         gain_adj_Q16 = (opus_int32)1 << 16;
    759     }
    760
    761     /* Scale input */
    762     inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
    763
    764     /* prepare inv_gain_Q23 in packed 4 32-bits */
    765     xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
    766
            /* Vector path: handles complete groups of four samples; the bound            */
            /* subfr_length - 3 keeps the 16-byte load and store inside the subframe.     */
    767     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
    768         xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
    769         /* rotate right by one 32-bit element so that elements 1 and 3 land in the even positions */
    770         xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    771
                /* _mm_mul_epi32 multiplies only the even 32-bit elements (signed) and     */
                /* yields two 64-bit products per register.                                */
    772         xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
    773         xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
    774
                /* Move product bits 16..47 into the low dword (elements 0, 2) and into    */
                /* the high dword (elements 1, 3) of each 64-bit lane respectively.        */
    775         xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
    776         xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
    777
                /* Mask 0xCC selects 16-bit words 2,3,6,7 (32-bit elements 1 and 3) from   */
                /* the second operand, interleaving the four results in source order.      */
    778         xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
    779
    780         _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
    781     }
    782
            /* Scalar tail: the remaining 0..3 samples not covered by the vector loop */
    783     for( ; i < psEncC->subfr_length; i++ ) {
    784         x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
    785     }
    786
    787     /* Save the current gain for the next call's gain adjustment */
    788     NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    789
    790     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q31 */
    791     if( NSQ->rewhite_flag ) {
    792         if( subfr == 0 ) {
    793             /* Do LTP downscaling; x_sc_Q10[] was already computed above, so this only affects the sLTP_Q15[] refill below */
    794             inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
    795         }
                /* Refill the lag + LTP_ORDER / 2 entries that precede sLTP_buf_idx */
    796         for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
    797             silk_assert( i < MAX_FRAME_LENGTH );
    798             sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
    799         }
    800     }
    801
    802     /* Adjust for changing gain (skipped when the adjustment factor is exactly 1.0 in Q16) */
    803     if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
    804         /* Scale long-term shaping state */
    805         {
    806             __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
    807
    808             /* prepare gain_adj_Q16 in packed 4 32-bits */
    809             xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );
    810
                    /* Same multiply / shift / blend sequence as the input scaling above,  */
                    /* applied in place to the ltp_mem_length entries before sLTP_shp_buf_idx. */
    811             for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
    812             {
    813                 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
    814                 /* rotate right by one 32-bit element so that elements 1 and 3 land in the even positions */
    815                 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    816
    817                 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
    818                 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
    819
    820                 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
    821                 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
    822
    823                 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
    824
    825                 _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
    826             }
    827
                    /* Scalar tail for the entries left over by the vector loop */
    828             for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
    829                 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
    830             }
    831
    832             /* Scale long-term prediction state; only for voiced frames that were not rewhitened, */
    833             /* and only up to sLTP_buf_idx - decisionDelay                                        */
    833             if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
    834                 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
    835                     sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
    836                 }
    837             }
    838
                    /* Apply the same gain adjustment to every delayed-decision state */
    839             for( k = 0; k < nStatesDelayedDecision; k++ ) {
    840                 psDD = &psDelDec[ k ];
    841
    842                 /* Scale scalar states */
    843                 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
    844
    845                 /* Scale short-term prediction and shaping states */
    846                 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
    847                     psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
    848                 }
    849                 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
    850                     psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
    851                 }
                        /* Scale the delayed prediction and shaping sample buffers */
    852                 for( i = 0; i < DECISION_DELAY; i++ ) {
    853                     psDD->Pred_Q15[  i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[  i ] );
    854                     psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
    855                 }
    856             }
    857         }
    858     }
    859 }
    860