1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // This code resamples the FFT bins, and smooths then with triangle-shaped 17 // weights to create a mel-frequency filter bank. For filter i centered at f_i, 18 // there is a triangular weighting of the FFT bins that extends from 19 // filter f_i-1 (with a value of zero at the left edge of the triangle) to f_i 20 // (where the filter value is 1) to f_i+1 (where the filter values returns to 21 // zero). 22 23 // Note: this code fails if you ask for too many channels. The algorithm used 24 // here assumes that each FFT bin contributes to at most two channels: the 25 // right side of a triangle for channel i, and the left side of the triangle 26 // for channel i+1. If you ask for so many channels that some of the 27 // resulting mel triangle filters are smaller than a single FFT bin, these 28 // channels may end up with no contributing FFT bins. The resulting mel 29 // spectrum output will have some channels that are always zero. 30 31 #include "tensorflow/core/kernels/mfcc_mel_filterbank.h" 32 33 #include <math.h> 34 35 #include "tensorflow/core/platform/logging.h" 36 37 namespace tensorflow { 38 39 MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {} 40 41 bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate, 42 int output_channel_count, 43 double lower_frequency_limit, 44 double upper_frequency_limit) { 45 num_channels_ = output_channel_count; 46 sample_rate_ = input_sample_rate; 47 input_length_ = input_length; 48 49 if (num_channels_ < 1) { 50 LOG(ERROR) << "Number of filterbank channels must be positive."; 51 return false; 52 } 53 54 if (sample_rate_ <= 0) { 55 LOG(ERROR) << "Sample rate must be positive."; 56 return false; 57 } 58 59 if (input_length < 2) { 60 LOG(ERROR) << "Input length must greater than 1."; 61 return false; 62 } 63 64 if (lower_frequency_limit < 0) { 65 LOG(ERROR) << "Lower frequency limit must be nonnegative."; 66 return false; 67 } 68 69 if (upper_frequency_limit <= lower_frequency_limit) { 70 LOG(ERROR) << "Upper frequency limit must be greater than " 71 << "lower frequency limit."; 72 return false; 73 } 74 75 // An extra center frequency is computed at the top to get the upper 76 // limit on the high side of the final triangular filter. 77 center_frequencies_.resize(num_channels_ + 1); 78 const double mel_low = FreqToMel(lower_frequency_limit); 79 const double mel_hi = FreqToMel(upper_frequency_limit); 80 const double mel_span = mel_hi - mel_low; 81 const double mel_spacing = mel_span / static_cast<double>(num_channels_ + 1); 82 for (int i = 0; i < num_channels_ + 1; ++i) { 83 center_frequencies_[i] = mel_low + (mel_spacing * (i + 1)); 84 } 85 86 // Always exclude DC; emulate HTK. 87 const double hz_per_sbin = 88 0.5 * sample_rate_ / static_cast<double>(input_length_ - 1); 89 start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin)); 90 end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin); 91 92 // Maps the input spectrum bin indices to filter bank channels/indices. For 93 // each FFT bin, band_mapper tells us which channel this bin contributes to 94 // on the right side of the triangle. Thus this bin also contributes to the 95 // left side of the next channel's triangle response. 96 band_mapper_.resize(input_length_); 97 int channel = 0; 98 for (int i = 0; i < input_length_; ++i) { 99 double melf = FreqToMel(i * hz_per_sbin); 100 if ((i < start_index_) || (i > end_index_)) { 101 band_mapper_[i] = -2; // Indicate an unused Fourier coefficient. 102 } else { 103 while ((center_frequencies_[channel] < melf) && 104 (channel < num_channels_)) { 105 ++channel; 106 } 107 band_mapper_[i] = channel - 1; // Can be == -1 108 } 109 } 110 111 // Create the weighting functions to taper the band edges. The contribution 112 // of any one FFT bin is based on its distance along the continuum between two 113 // mel-channel center frequencies. This bin contributes weights_[i] to the 114 // current channel and 1-weights_[i] to the next channel. 115 weights_.resize(input_length_); 116 for (int i = 0; i < input_length_; ++i) { 117 channel = band_mapper_[i]; 118 if ((i < start_index_) || (i > end_index_)) { 119 weights_[i] = 0.0; 120 } else { 121 if (channel >= 0) { 122 weights_[i] = 123 (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) / 124 (center_frequencies_[channel + 1] - center_frequencies_[channel]); 125 } else { 126 weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) / 127 (center_frequencies_[0] - mel_low); 128 } 129 } 130 } 131 // Check the sum of FFT bin weights for every mel band to identify 132 // situations where the mel bands are so narrow that they don't get 133 // significant weight on enough (or any) FFT bins -- i.e., too many 134 // mel bands have been requested for the given FFT size. 135 std::vector<int> bad_channels; 136 for (int c = 0; c < num_channels_; ++c) { 137 float band_weights_sum = 0.0; 138 for (int i = 0; i < input_length_; ++i) { 139 if (band_mapper_[i] == c - 1) { 140 band_weights_sum += (1.0 - weights_[i]); 141 } else if (band_mapper_[i] == c) { 142 band_weights_sum += weights_[i]; 143 } 144 } 145 // The lowest mel channels have the fewest FFT bins and the lowest 146 // weights sum. But given that the target gain at the center frequency 147 // is 1.0, if the total sum of weights is 0.5, we're in bad shape. 148 if (band_weights_sum < 0.5) { 149 bad_channels.push_back(c); 150 } 151 } 152 if (!bad_channels.empty()) { 153 LOG(ERROR) << "Missing " << bad_channels.size() << " bands " 154 << " starting at " << bad_channels[0] 155 << " in mel-frequency design. " 156 << "Perhaps too many channels or " 157 << "not enough frequency resolution in spectrum. (" 158 << "input_length: " << input_length 159 << " input_sample_rate: " << input_sample_rate 160 << " output_channel_count: " << output_channel_count 161 << " lower_frequency_limit: " << lower_frequency_limit 162 << " upper_frequency_limit: " << upper_frequency_limit; 163 } 164 initialized_ = true; 165 return true; 166 } 167 168 // Compute the mel spectrum from the squared-magnitude FFT input by taking the 169 // square root, then summing FFT magnitudes under triangular integration windows 170 // whose widths increase with frequency. 171 void MfccMelFilterbank::Compute(const std::vector<double> &input, 172 std::vector<double> *output) const { 173 if (!initialized_) { 174 LOG(ERROR) << "Mel Filterbank not initialized."; 175 return; 176 } 177 178 if (input.size() <= end_index_) { 179 LOG(ERROR) << "Input too short to compute filterbank"; 180 return; 181 } 182 183 // Ensure output is right length and reset all values. 184 output->assign(num_channels_, 0.0); 185 186 for (int i = start_index_; i <= end_index_; i++) { // For each FFT bin 187 double spec_val = sqrt(input[i]); 188 double weighted = spec_val * weights_[i]; 189 int channel = band_mapper_[i]; 190 if (channel >= 0) 191 (*output)[channel] += weighted; // Right side of triangle, downward slope 192 channel++; 193 if (channel < num_channels_) 194 (*output)[channel] += spec_val - weighted; // Left side of triangle 195 } 196 } 197 198 double MfccMelFilterbank::FreqToMel(double freq) const { 199 return 1127.0 * log(1.0 + (freq / 700.0)); 200 } 201 202 } // namespace tensorflow 203