/*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/modules/audio_coding/codecs/isac/fix/source/lpc_masking_model.h"

// MIPS DSPR2 optimization for function WebRtcIsacfix_CalculateResidualEnergy
// Bit-exact with WebRtcIsacfix_CalculateResidualEnergyC from file
// lpc_masking_model.c
// Computes residual_energy = sum over i, j of
//   a_polynomial[j] * corr_coeffs[i] * a_polynomial[j - i]
// (the i > 0 cross terms are counted twice), accumulating into a 64-bit
// software accumulator held in the register pair (sum64_hi:sum64_lo).
// Whenever an addition into the accumulator would overflow, both the
// accumulator and the new term are halved and the extra shift is recorded
// in |shift_internal|.  The sum is finally normalized into a positive
// 31-bit mantissa (the return value); the Q-domain of that mantissa is
// written to |*q_val_residual_energy|.
// NOTE(review): relies on MIPS DSP-R2 instructions (mult $ac0, shilov,
// prepend, addsc/addwc) — only correct on cores with the DSP ASE rev 2.
int32_t WebRtcIsacfix_CalculateResidualEnergyMIPS(int lpc_order,
                                                  int32_t q_val_corr,
                                                  int q_val_polynomial,
                                                  int16_t* a_polynomial,
                                                  int32_t* corr_coeffs,
                                                  int* q_val_residual_energy) {

  int i = 0, j = 0;
  int shift_internal = 0, shift_norm = 0;
  int32_t tmp32 = 0, word32_high = 0, word32_low = 0, residual_energy = 0;
  int32_t tmp_corr_c = corr_coeffs[0];
  int16_t* tmp_a_poly = &a_polynomial[0];
  // 64-bit accumulator, kept as two 32-bit halves and added with the DSP
  // carry chain (addsc/addwc) below.
  int32_t sum64_hi = 0;
  int32_t sum64_lo = 0;

  // First pass: the diagonal (i == 0) terms, corr_coeffs[0] * a_polynomial[j]^2.
  for (j = 0; j <= lpc_order; j++) {
    // For the case of i == 0:
    //   residual_energy +=
    //     a_polynomial[j] * corr_coeffs[i] * a_polynomial[j - i];

    int32_t tmp2, tmp3;  // High and low word of the 64-bit product term.
    int16_t sign_1;      // Sign bit (0 or -1) of the product's high word.
    int16_t sign_2;      // Sign bit of the accumulator BEFORE adding the term.
    int16_t sign_3;      // Nonzero iff the term and accumulator signs differ.

    // Form a_polynomial[j]^2 (32-bit 'mul'), widen-multiply by corr_coeffs[0]
    // into accumulator $ac0 ('mult'), and shift $ac0 right by the current
    // |shift_internal| ('shilov' = variable accumulator shift) so the new
    // term is in the same scale as (sum64_hi:sum64_lo).  The hi/lo words of
    // the shifted product come back in tmp2/tmp3.
    __asm __volatile (
      ".set      push                                                \n\t"
      ".set      noreorder                                           \n\t"
      "lh        %[tmp2],         0(%[tmp_a_poly])                   \n\t"
      "mul       %[tmp32],        %[tmp2],            %[tmp2]        \n\t"
      "addiu     %[tmp_a_poly],   %[tmp_a_poly],      2              \n\t"
      "sra       %[sign_2],       %[sum64_hi],        31             \n\t"
      "mult      $ac0,            %[tmp32],           %[tmp_corr_c]  \n\t"
      "shilov    $ac0,            %[shift_internal]                  \n\t"
      "mfhi      %[tmp2],         $ac0                               \n\t"
      "mflo      %[tmp3],         $ac0                               \n\t"
      "sra       %[sign_1],       %[tmp2],            31             \n\t"
      "xor       %[sign_3],       %[sign_1],          %[sign_2]      \n\t"
      ".set      pop                                                 \n\t"
      : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), [tmp32] "=&r" (tmp32),
        [tmp_a_poly] "+r" (tmp_a_poly), [sign_1] "=&r" (sign_1),
        [sign_3] "=&r" (sign_3), [sign_2] "=&r" (sign_2),
        [sum64_hi] "+r" (sum64_hi), [sum64_lo] "+r" (sum64_lo)
      : [tmp_corr_c] "r" (tmp_corr_c), [shift_internal] "r" (shift_internal)
      : "hi", "lo", "memory"
    );

    if (sign_3 != 0) {
      // Opposite signs: the 64-bit addition cannot overflow, add directly.
      // 'addsc' adds the low words and latches the carry; 'addwc' adds the
      // high words plus that carry.
      __asm __volatile (
        ".set      push                                      \n\t"
        ".set      noreorder                                 \n\t"
        "addsc     %[sum64_lo],   %[sum64_lo],    %[tmp3]    \n\t"
        "addwc     %[sum64_hi],   %[sum64_hi],    %[tmp2]    \n\t"
        ".set      pop                                       \n\t"
        : [sum64_hi] "+r" (sum64_hi), [sum64_lo] "+r" (sum64_lo)
        : [tmp2] "r" (tmp2), [tmp3] "r" (tmp3)
        : "hi", "lo", "memory"
      );
    } else {
      // Same signs: overflow is possible.  Positive + positive overflows if
      // tmp2 exceeds the headroom below 0x7FFFFFFF; negative + negative
      // overflows if the (wrapping) sum of the high words turns positive.
      if (((!(sign_1 || sign_2)) && (0x7FFFFFFF - sum64_hi < tmp2)) ||
          ((sign_1 && sign_2) && (sum64_hi + tmp2 > 0))) {
        // Shift right for overflow.
        // Halve both the accumulator and the new term before adding:
        // 'prepend rt, rs, 1' funnels the LSB of the high word into the top
        // of the low word, and 'sra' arithmetic-shifts the high word, i.e. a
        // 64-bit arithmetic shift right by one.  Record the scale change.
        __asm __volatile (
          ".set      push                                             \n\t"
          ".set      noreorder                                        \n\t"
          "addiu     %[shift_internal], %[shift_internal],  1         \n\t"
          "prepend   %[sum64_lo],       %[sum64_hi],        1         \n\t"
          "sra       %[sum64_hi],       %[sum64_hi],        1         \n\t"
          "prepend   %[tmp3],           %[tmp2],            1         \n\t"
          "sra       %[tmp2],           %[tmp2],            1         \n\t"
          "addsc     %[sum64_lo],       %[sum64_lo],        %[tmp3]   \n\t"
          "addwc     %[sum64_hi],       %[sum64_hi],        %[tmp2]   \n\t"
          ".set      pop                                              \n\t"
          : [tmp2] "+r" (tmp2), [tmp3] "+r" (tmp3),
            [shift_internal] "+r" (shift_internal),
            [sum64_hi] "+r" (sum64_hi), [sum64_lo] "+r" (sum64_lo)
          :
          : "hi", "lo", "memory"
        );
      } else {
        // Same signs but enough headroom: plain carry-chained 64-bit add.
        __asm __volatile (
          ".set      push                                      \n\t"
          ".set      noreorder                                 \n\t"
          "addsc     %[sum64_lo],   %[sum64_lo],    %[tmp3]    \n\t"
          "addwc     %[sum64_hi],   %[sum64_hi],    %[tmp2]    \n\t"
          ".set      pop                                       \n\t"
          : [sum64_hi] "+r" (sum64_hi), [sum64_lo] "+r" (sum64_lo)
          : [tmp2] "r" (tmp2), [tmp3] "r" (tmp3)
          : "hi", "lo", "memory"
        );
      }
    }
  }

  // Second pass: the off-diagonal (i >= 1) terms.  Each product is doubled
  // (the 'sll ... 1' below) because a_polynomial[j] * a_polynomial[j - i]
  // appears symmetrically for (j, j - i) and (j - i, j).
  for (i = 1; i <= lpc_order; i++) {
    tmp_corr_c = corr_coeffs[i];
    int16_t* tmp_a_poly_j = &a_polynomial[i];    // Walks a_polynomial[j].
    int16_t* tmp_a_poly_j_i = &a_polynomial[0];  // Walks a_polynomial[j - i].
    for (j = i; j <= lpc_order; j++) {
      // For the case of i = 1 .. lpc_order:
      //   residual_energy +=
      //     a_polynomial[j] * corr_coeffs[i] * a_polynomial[j - i] * 2;

      int32_t tmp2, tmp3;  // High and low word of the 64-bit product term.
      int16_t sign_1;      // Sign bit of the product's high word.
      int16_t sign_2;      // Sign bit of the accumulator before the add.
      int16_t sign_3;      // Nonzero iff the two signs differ.

      // Same structure as the first pass, except the 16x16 product is
      // a_polynomial[j] * a_polynomial[j - i], doubled before the widening
      // multiply by corr_coeffs[i].
      __asm __volatile (
        ".set      push                                                   \n\t"
        ".set      noreorder                                              \n\t"
        "lh        %[tmp3],           0(%[tmp_a_poly_j])                  \n\t"
        "lh        %[tmp2],           0(%[tmp_a_poly_j_i])                \n\t"
        "addiu     %[tmp_a_poly_j],   %[tmp_a_poly_j],    2               \n\t"
        "addiu     %[tmp_a_poly_j_i], %[tmp_a_poly_j_i],  2               \n\t"
        "mul       %[tmp32],          %[tmp3],            %[tmp2]         \n\t"
        "sll       %[tmp32],          %[tmp32],           1               \n\t"
        "mult      $ac0,              %[tmp32],           %[tmp_corr_c]   \n\t"
        "shilov    $ac0,              %[shift_internal]                   \n\t"
        "mfhi      %[tmp2],           $ac0                                \n\t"
        "mflo      %[tmp3],           $ac0                                \n\t"
        "sra       %[sign_1],         %[tmp2],            31              \n\t"
        "sra       %[sign_2],         %[sum64_hi],        31              \n\t"
        "xor       %[sign_3],         %[sign_1],          %[sign_2]       \n\t"
        ".set      pop                                                    \n\t"
        : [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3), [tmp32] "=&r" (tmp32),
          [tmp_a_poly_j] "+r" (tmp_a_poly_j), [sign_1] "=&r" (sign_1),
          [tmp_a_poly_j_i] "+r" (tmp_a_poly_j_i), [sign_2] "=&r" (sign_2),
          [sign_3] "=&r" (sign_3), [sum64_hi] "+r" (sum64_hi),
          [sum64_lo] "+r" (sum64_lo)
        : [tmp_corr_c] "r" (tmp_corr_c), [shift_internal] "r" (shift_internal)
        : "hi", "lo", "memory"
      );
      if (sign_3 != 0) {
        // Opposite signs: no overflow possible, carry-chained 64-bit add.
        __asm __volatile (
          ".set      push                                     \n\t"
          ".set      noreorder                                \n\t"
          "addsc     %[sum64_lo],   %[sum64_lo],   %[tmp3]    \n\t"
          "addwc     %[sum64_hi],   %[sum64_hi],   %[tmp2]    \n\t"
          ".set      pop                                      \n\t"
          : [tmp2] "+r" (tmp2), [tmp3] "+r" (tmp3), [sum64_hi] "+r" (sum64_hi),
            [sum64_lo] "+r" (sum64_lo)
          :
          :"memory"
        );
      } else {
        // Test overflow and sum the result.
        if (((!(sign_1 || sign_2)) && (0x7FFFFFFF - sum64_hi < tmp2)) ||
            ((sign_1 && sign_2) && (sum64_hi + tmp2 > 0))) {
          // Shift right for overflow.
          // 64-bit halving of accumulator and term (see first pass), then add.
          __asm __volatile (
            ".set      push                                              \n\t"
            ".set      noreorder                                         \n\t"
            "addiu     %[shift_internal],  %[shift_internal],  1         \n\t"
            "prepend   %[sum64_lo],        %[sum64_hi],        1         \n\t"
            "sra       %[sum64_hi],        %[sum64_hi],        1         \n\t"
            "prepend   %[tmp3],            %[tmp2],            1         \n\t"
            "sra       %[tmp2],            %[tmp2],            1         \n\t"
            "addsc     %[sum64_lo],        %[sum64_lo],        %[tmp3]   \n\t"
            "addwc     %[sum64_hi],        %[sum64_hi],        %[tmp2]   \n\t"
            ".set      pop                                               \n\t"
            : [tmp2] "+r" (tmp2), [tmp3] "+r" (tmp3),
              [shift_internal] "+r" (shift_internal),
              [sum64_hi] "+r" (sum64_hi), [sum64_lo] "+r" (sum64_lo)
            :
            : "hi", "lo", "memory"
          );
        } else {
          // Same signs with headroom: plain carry-chained 64-bit add.
          __asm __volatile (
            ".set      push                                      \n\t"
            ".set      noreorder                                 \n\t"
            "addsc     %[sum64_lo],    %[sum64_lo],   %[tmp3]    \n\t"
            "addwc     %[sum64_hi],    %[sum64_hi],   %[tmp2]    \n\t"
            ".set      pop                                       \n\t"
            : [tmp2] "+r" (tmp2), [tmp3] "+r" (tmp3),
              [sum64_hi] "+r" (sum64_hi), [sum64_lo] "+r" (sum64_lo)
            :
            : "hi", "lo", "memory"
          );
        }
      }
    }
  }
  word32_high = sum64_hi;
  word32_low = sum64_lo;

  // Calculate the value of shifting (shift_norm) for the 64-bit sum.
  if (word32_high != 0) {
    // High word is nonzero: shift the 64-bit value right so it fits in
    // 32 bits.  residual_energy = (sum64 >> shift_norm), assembled from the
    // low word's top bits and the high word's remaining bits.
    shift_norm = 32 - WebRtcSpl_NormW32(word32_high);
    int tmp1;
    __asm __volatile (
      ".set    push                                                     \n\t"
      ".set    noreorder                                                \n\t"
      "srl     %[residual_energy],  %[sum64_lo],         %[shift_norm]  \n\t"
      "li      %[tmp1],             32                                  \n\t"
      "subu    %[tmp1],             %[tmp1],             %[shift_norm]  \n\t"
      "sll     %[tmp1],             %[sum64_hi],         %[tmp1]        \n\t"
      "or      %[residual_energy],  %[residual_energy],  %[tmp1]        \n\t"
      ".set    pop                                                      \n\t"
      : [residual_energy] "=&r" (residual_energy), [tmp1]"=&r"(tmp1),
        [sum64_hi] "+r" (sum64_hi), [sum64_lo] "+r" (sum64_lo)
      : [shift_norm] "r" (shift_norm)
      : "memory"
    );
  } else {
    if ((word32_low & 0x80000000) != 0) {
      // Low word alone, but its sign bit is set: shift right once so the
      // result stays a positive 31-bit value.
      shift_norm = 1;
      residual_energy = (uint32_t)word32_low >> 1;
    } else {
      // Low word alone: shift left to normalize; a left shift is a negative
      // net right-shift, hence the negation.
      shift_norm = WebRtcSpl_NormW32(word32_low);
      residual_energy = word32_low << shift_norm;
      shift_norm = -shift_norm;
    }
  }

  // Q(q_val_polynomial * 2) * Q(q_val_corr) >> shift_internal >> shift_norm
  //   = Q(q_val_corr - shift_internal - shift_norm + q_val_polynomial * 2)
  *q_val_residual_energy =
      q_val_corr - shift_internal - shift_norm + q_val_polynomial * 2;

  return residual_energy;
}
    238