Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // This code resamples the FFT bins, and smooths then with triangle-shaped
     17 // weights to create a mel-frequency filter bank. For filter i centered at f_i,
     18 // there is a triangular weighting of the FFT bins that extends from
     19 // filter f_i-1 (with a value of zero at the left edge of the triangle) to f_i
     20 // (where the filter value is 1) to f_i+1 (where the filter values returns to
     21 // zero).
     22 
     23 // Note: this code fails if you ask for too many channels.  The algorithm used
     24 // here assumes that each FFT bin contributes to at most two channels: the
     25 // right side of a triangle for channel i, and the left side of the triangle
     26 // for channel i+1.  If you ask for so many channels that some of the
     27 // resulting mel triangle filters are smaller than a single FFT bin, these
     28 // channels may end up with no contributing FFT bins.  The resulting mel
     29 // spectrum output will have some channels that are always zero.
     30 
     31 #include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
     32 
     33 #include <math.h>
     34 
     35 #include "tensorflow/core/platform/logging.h"
     36 
     37 namespace tensorflow {
     38 
     39 MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
     40 
     41 bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
     42                                    int output_channel_count,
     43                                    double lower_frequency_limit,
     44                                    double upper_frequency_limit) {
     45   num_channels_ = output_channel_count;
     46   sample_rate_ = input_sample_rate;
     47   input_length_ = input_length;
     48 
     49   if (num_channels_ < 1) {
     50     LOG(ERROR) << "Number of filterbank channels must be positive.";
     51     return false;
     52   }
     53 
     54   if (sample_rate_ <= 0) {
     55     LOG(ERROR) << "Sample rate must be positive.";
     56     return false;
     57   }
     58 
     59   if (input_length < 2) {
     60     LOG(ERROR) << "Input length must greater than 1.";
     61     return false;
     62   }
     63 
     64   if (lower_frequency_limit < 0) {
     65     LOG(ERROR) << "Lower frequency limit must be nonnegative.";
     66     return false;
     67   }
     68 
     69   if (upper_frequency_limit <= lower_frequency_limit) {
     70     LOG(ERROR) << "Upper frequency limit must be greater than "
     71                << "lower frequency limit.";
     72     return false;
     73   }
     74 
     75   // An extra center frequency is computed at the top to get the upper
     76   // limit on the high side of the final triangular filter.
     77   center_frequencies_.resize(num_channels_ + 1);
     78   const double mel_low = FreqToMel(lower_frequency_limit);
     79   const double mel_hi = FreqToMel(upper_frequency_limit);
     80   const double mel_span = mel_hi - mel_low;
     81   const double mel_spacing = mel_span / static_cast<double>(num_channels_ + 1);
     82   for (int i = 0; i < num_channels_ + 1; ++i) {
     83     center_frequencies_[i] = mel_low + (mel_spacing * (i + 1));
     84   }
     85 
     86   // Always exclude DC; emulate HTK.
     87   const double hz_per_sbin =
     88       0.5 * sample_rate_ / static_cast<double>(input_length_ - 1);
     89   start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin));
     90   end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
     91 
     92   // Maps the input spectrum bin indices to filter bank channels/indices. For
     93   // each FFT bin, band_mapper tells us which channel this bin contributes to
     94   // on the right side of the triangle.  Thus this bin also contributes to the
     95   // left side of the next channel's triangle response.
     96   band_mapper_.resize(input_length_);
     97   int channel = 0;
     98   for (int i = 0; i < input_length_; ++i) {
     99     double melf = FreqToMel(i * hz_per_sbin);
    100     if ((i < start_index_) || (i > end_index_)) {
    101       band_mapper_[i] = -2;  // Indicate an unused Fourier coefficient.
    102     } else {
    103       while ((center_frequencies_[channel] < melf) &&
    104              (channel < num_channels_)) {
    105         ++channel;
    106       }
    107       band_mapper_[i] = channel - 1;  // Can be == -1
    108     }
    109   }
    110 
    111   // Create the weighting functions to taper the band edges.  The contribution
    112   // of any one FFT bin is based on its distance along the continuum between two
    113   // mel-channel center frequencies.  This bin contributes weights_[i] to the
    114   // current channel and 1-weights_[i] to the next channel.
    115   weights_.resize(input_length_);
    116   for (int i = 0; i < input_length_; ++i) {
    117     channel = band_mapper_[i];
    118     if ((i < start_index_) || (i > end_index_)) {
    119       weights_[i] = 0.0;
    120     } else {
    121       if (channel >= 0) {
    122         weights_[i] =
    123             (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) /
    124             (center_frequencies_[channel + 1] - center_frequencies_[channel]);
    125       } else {
    126         weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
    127                       (center_frequencies_[0] - mel_low);
    128       }
    129     }
    130   }
    131   // Check the sum of FFT bin weights for every mel band to identify
    132   // situations where the mel bands are so narrow that they don't get
    133   // significant weight on enough (or any) FFT bins -- i.e., too many
    134   // mel bands have been requested for the given FFT size.
    135   std::vector<int> bad_channels;
    136   for (int c = 0; c < num_channels_; ++c) {
    137     float band_weights_sum = 0.0;
    138     for (int i = 0; i < input_length_; ++i) {
    139       if (band_mapper_[i] == c - 1) {
    140         band_weights_sum += (1.0 - weights_[i]);
    141       } else if (band_mapper_[i] == c) {
    142         band_weights_sum += weights_[i];
    143       }
    144     }
    145     // The lowest mel channels have the fewest FFT bins and the lowest
    146     // weights sum.  But given that the target gain at the center frequency
    147     // is 1.0, if the total sum of weights is 0.5, we're in bad shape.
    148     if (band_weights_sum < 0.5) {
    149       bad_channels.push_back(c);
    150     }
    151   }
    152   if (!bad_channels.empty()) {
    153     LOG(ERROR) << "Missing " << bad_channels.size() << " bands "
    154                << " starting at " << bad_channels[0]
    155                << " in mel-frequency design. "
    156                << "Perhaps too many channels or "
    157                << "not enough frequency resolution in spectrum. ("
    158                << "input_length: " << input_length
    159                << " input_sample_rate: " << input_sample_rate
    160                << " output_channel_count: " << output_channel_count
    161                << " lower_frequency_limit: " << lower_frequency_limit
    162                << " upper_frequency_limit: " << upper_frequency_limit;
    163   }
    164   initialized_ = true;
    165   return true;
    166 }
    167 
    168 // Compute the mel spectrum from the squared-magnitude FFT input by taking the
    169 // square root, then summing FFT magnitudes under triangular integration windows
    170 // whose widths increase with frequency.
    171 void MfccMelFilterbank::Compute(const std::vector<double> &input,
    172                                 std::vector<double> *output) const {
    173   if (!initialized_) {
    174     LOG(ERROR) << "Mel Filterbank not initialized.";
    175     return;
    176   }
    177 
    178   if (input.size() <= end_index_) {
    179     LOG(ERROR) << "Input too short to compute filterbank";
    180     return;
    181   }
    182 
    183   // Ensure output is right length and reset all values.
    184   output->assign(num_channels_, 0.0);
    185 
    186   for (int i = start_index_; i <= end_index_; i++) {  // For each FFT bin
    187     double spec_val = sqrt(input[i]);
    188     double weighted = spec_val * weights_[i];
    189     int channel = band_mapper_[i];
    190     if (channel >= 0)
    191       (*output)[channel] += weighted;  // Right side of triangle, downward slope
    192     channel++;
    193     if (channel < num_channels_)
    194       (*output)[channel] += spec_val - weighted;  // Left side of triangle
    195   }
    196 }
    197 
    198 double MfccMelFilterbank::FreqToMel(double freq) const {
    199   return 1127.0 * log(1.0 + (freq / 700.0));
    200 }
    201 
    202 }  // namespace tensorflow
    203