/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Reference implementation of the preprocessing pipeline, with the same
// results as the audio tutorial at
// https://www.tensorflow.org/tutorials/sequences/audio_recognition
// This module takes 30ms of PCM-encoded signed 16-bit audio samples (at 16KHz,
// so 480 values), and extracts a power spectrum of frequencies. There are 43
// frequency bands in the result, derived from the original 256 outputs of the
// discrete Fourier transform, and averaged together in groups of 6.
// It's expected that most platforms will have optimized versions of the
// functions used here, for example replacing the DFT with an FFT, so this
// version shouldn't be used where performance is critical.

#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"

#include <cmath>

#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"

namespace {

// Needed because some platforms don't have M_PI defined.
constexpr float kPi = 3.14159265358979323846f;

// Performs a discrete Fourier transform on the real inputs. This corresponds
// to rdft() in the FFT package at
// http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html, and to kiss_fftr() in
// KISSFFT at https://github.com/mborgerding/kissfft.
// It takes in an array of float real values, and returns a result of the same
// length with float real and imaginary components interleaved, so
// fourier_output[0] is the first real value, fourier_output[1] is the first
// imaginary, fourier_output[2] is the second real, and so on.
// The calling function should ensure that the array passed in as
// fourier_output is at least time_series_size in length. Most optimized FFT
// implementations require the length to be a power of two as well, but this
// version doesn't enforce that.
void CalculateDiscreteFourierTransform(float* time_series,
                                       int time_series_size,
                                       float* fourier_output) {
  for (int i = 0; i < time_series_size / 2; ++i) {
    float real = 0;
    for (int j = 0; j < time_series_size; ++j) {
      real += time_series[j] * cos(j * i * kPi * 2 / time_series_size);
    }
    float imaginary = 0;
    for (int j = 0; j < time_series_size; ++j) {
      imaginary -= time_series[j] * sin(j * i * kPi * 2 / time_series_size);
    }
    fourier_output[(i * 2) + 0] = real;
    fourier_output[(i * 2) + 1] = imaginary;
  }
}
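
// As a worked illustration of the output layout above, assuming
// kMaxAudioSampleSize is 512 (the next power of two above 480, matching the
// 256 bins mentioned at the top of this file): the loop produces 256 complex
// outputs, interleaved as
//   fourier_output[2 * i]     =  sum_j time_series[j] * cos(2 * kPi * i * j / 512)
//   fourier_output[2 * i + 1] = -sum_j time_series[j] * sin(2 * kPi * i * j / 512)
// so at the 16KHz sample rate, bin i covers frequencies around
// i * 16000 / 512 = i * 31.25Hz.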
// Produces a simple sine curve that is used to ensure frequencies at the
// center of the current sample window are weighted more heavily than those
// at the ends.
void CalculatePeriodicHann(int window_length, float* window_function) {
  for (int i = 0; i < window_length; ++i) {
    window_function[i] = 0.5 - 0.5 * cos((2 * kPi * i) / window_length);
  }
}

}  // namespace

TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
                                    const int16_t* input, int input_size,
                                    int output_size, uint8_t* output) {
  // Ensure our input and output data arrays are valid.
  if (input_size > kMaxAudioSampleSize) {
    error_reporter->Report("Input size %d larger than %d", input_size,
                           kMaxAudioSampleSize);
    return kTfLiteError;
  }
  if (output_size != kFeatureSliceSize) {
    error_reporter->Report("Requested output size %d doesn't match %d",
                           output_size, kFeatureSliceSize);
    return kTfLiteError;
  }

  // Pre-calculate the window function we'll be applying to the input data.
  // In a real application, we'd calculate this table once in an
  // initialization function and store it for repeated reuse.
  float window_function[kMaxAudioSampleSize];
  CalculatePeriodicHann(input_size, window_function);

  // Apply the window function to our time series input, and pad it with
  // zeroes to the next power of two.
  float float_input[kMaxAudioSampleSize];
  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
    if (i < input_size) {
      float_input[i] =
          (input[i] * window_function[i]) / static_cast<float>(1 << 15);
    } else {
      float_input[i] = 0.0f;
    }
  }

  // Pull the frequency data from the time series sample.
  float fourier_values[kMaxAudioSampleSize];
  CalculateDiscreteFourierTransform(float_input, kMaxAudioSampleSize,
                                    fourier_values);

  // We have the complex numbers giving us information about each frequency
  // band, but all we want to know is how strong each frequency is, so
  // calculate the squared magnitude by adding together the squares of each
  // component.
  float power_spectrum[kMaxAudioSampleSize / 2];
  for (int i = 0; i < (kMaxAudioSampleSize / 2); ++i) {
    const float real = fourier_values[(i * 2) + 0];
    const float imaginary = fourier_values[(i * 2) + 1];
    power_spectrum[i] = (real * real) + (imaginary * imaginary);
  }

  // Finally, reduce the size of the output by averaging together six adjacent
  // frequencies into each slot, producing an array of 43 values.
  for (int i = 0; i < kFeatureSliceSize; ++i) {
    float total = 0.0f;
    for (int j = 0; j < kAverageWindowSize; ++j) {
      const int index = (i * kAverageWindowSize) + j;
      if (index < (kMaxAudioSampleSize / 2)) {
        total += power_spectrum[index];
      }
    }
    const float average = total / kAverageWindowSize;
    // Quantize the result into eight bits, effectively multiplying by two.
    // The 127.5 constant here has to match the features_max value defined in
    // tensorflow/examples/speech_commands/input_data.py, and this also
    // assumes that features_min is zero. If it wasn't, we'd have to subtract
    // it first.
    int quantized_average = roundf(average * (255.0f / 127.5f));
    if (quantized_average < 0) {
      quantized_average = 0;
    }
    if (quantized_average > 255) {
      quantized_average = 255;
    }
    output[i] = quantized_average;
  }
  return kTfLiteOk;
}
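
// As a usage sketch only: a caller might invoke GenerateSimpleFeatures()
// roughly as outlined below, assuming a 30ms capture buffer of 480 int16_t
// samples at 16KHz and a kFeatureSliceSize-byte output slice. The buffer
// names here are hypothetical placeholders, and the error reporter is assumed
// to be the tflite::MicroErrorReporter used elsewhere in the micro_speech
// example (tensorflow/lite/experimental/micro/micro_error_reporter.h).
//
//   tflite::MicroErrorReporter micro_error_reporter;
//   tflite::ErrorReporter* error_reporter = &micro_error_reporter;
//   int16_t audio_samples[480] = {};  // Would hold the captured PCM audio.
//   uint8_t feature_slice[kFeatureSliceSize];
//   TfLiteStatus status = GenerateSimpleFeatures(
//       error_reporter, audio_samples, 480, kFeatureSliceSize, feature_slice);
//   if (status != kTfLiteOk) {
//     // Feature generation failed; skip this window of audio.
//   }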