Created
May 11, 2019 20:19
-
-
Save dv1/dfeabd20fd2bcb5f702fdd12dc192c97 to your computer and use it in GitHub Desktop.
Simple example for how to perform bandlimited sample rate conversion with a polyphase filter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <vector> | |
| #include <string> | |
| #include <cmath> | |
| #include <iostream> | |
| #include <sndfile.h> | |
| #ifdef WITH_RLIMIT | |
| #include <sys/resource.h> | |
| #endif | |
| //// Common types //// | |
| typedef float sample_type; | |
| typedef std::vector < sample_type > samples; | |
| //// Math functions //// | |
| static float const pi = 3.1415926535f; | |
| unsigned int gcd(unsigned int p_first, unsigned int p_second) | |
| { | |
| if (p_first == 0) return p_second; | |
| if (p_second == 0) return p_first; | |
| do | |
| { | |
| unsigned int temp = p_first % p_second; | |
| p_first = p_second; | |
| p_second = temp; | |
| } | |
| while (p_second != 0); | |
| return p_first; | |
| } | |
| unsigned int lcm(unsigned int p_first, unsigned int p_second) | |
| { | |
| unsigned int temp = gcd(p_first, p_second); | |
| return (temp != 0) ? (p_first * p_second / temp) : 0; | |
| } | |
| float sinc(float f) | |
| { | |
| return (f == 0.0f) ? 1.0f : (std::sin(f * pi) / (f * pi)); | |
| } | |
| float hanning_func(int n, int N) | |
| { | |
| return 0.5f * (1.0f - std::cos(2.0f * pi * float(n) / float(N - 1))); | |
| } | |
| samples compute_hanning_window(std::size_t const p_window_size) | |
| { | |
| samples window(p_window_size, 0); | |
| for (std::size_t i = 0; i < p_window_size; ++i) | |
| window[i] = hanning_func(i, p_window_size); | |
| return window; | |
| } | |
| sample_type convolve(sample_type const *p_first_samples, sample_type const *p_second_samples, std::size_t const p_num_samples) | |
| { | |
| sample_type result = 0.0f; | |
| int num_samples = int(p_num_samples); | |
| for (int first_index = 0; first_index < num_samples; ++first_index) | |
| { | |
| int second_index = num_samples - 1 - first_index; | |
| sample_type first_sample = p_first_samples[first_index]; | |
| sample_type second_sample = p_second_samples[second_index]; | |
| result += first_sample * second_sample; | |
| } | |
| return result; | |
| } | |
| samples compute_polyphase_filter_bank(std::size_t const p_upsampling_factor, std::size_t const p_downsampling_factor, std::size_t const p_filter_size) | |
| { | |
| // Create the window function that will be applied on top of the | |
| // stretched sinc. The window size equals the filter size plus the | |
| // upsampling factor, since we need to account for the extra nullsamples that | |
| // get inserted during zerostuffing. | |
| samples window = compute_hanning_window(p_filter_size * p_upsampling_factor); | |
| samples filter(window.size(), 0); | |
| // We need to create a low pass filter that cuts off frequencies at half of | |
| // either the input or the output sample rate, whichever is lower. However, | |
| // it turns out that for the filter calculation, we don't really need the | |
| // sample rates directly. Instead, we only need to "stretch" sinc by a | |
| // specific factor. A factor of 1 means no change. A factor of 2 means that | |
| // half of the original frequency range is cut off etc. | |
| // | |
| // If we only want to upsample by a factor of 2, and don't want to downsample, | |
| // then it is sufficient to stretch the sinc by a factor of 2. This is because | |
| // when we upsample, we apply zero stuffing, which creates unwantd spectral | |
| // images above the desired range. So, if for example we upsample by a factor | |
| // of 2, this means we insert 2-1 = 1 nullsample in between the original | |
| // samples (this is the zero-stuffing part). Like this: | |
| // | |
| // a b c d e f ... -> a 0 b 0 c 0 d 0 e 0 f 0 ... | |
| // | |
| // The lower 50% of the spectrum contains the original spectral image. So, we | |
| // need to get rid of anything except the lower half in the zero-stuffed signal. | |
| // | |
| // Another example: Upsampling by a factor of 3, no downsampling. Here, | |
| // we have 2 unwanted spectral images above the original one, and 3-1 = 2 | |
| // nullsamples were stuffed in between the original samples. Like this: | |
| // | |
| // a b c d ... -> a 0 0 b 0 0 c 0 0 d 0 0 ... | |
| // | |
| // Since we only want the original image, and its spectrum only makes 1/3rd | |
| // of the spectrum of the zerostuffed signal, we stretch sinc by a factor of | |
| // 3, which causes it to lowpass-filter anything except the lowest 33,3% of | |
| // the spectrum. | |
| // | |
| // If we also downsample, then it depends by how much. If the downsampling | |
| // factor is higher than the upsampling, it means that the downsampled | |
| // signal will have a Nyquist frequency that is *lower* than that of the | |
| // original signal. Example: converting from 24000 Hz to 22050 Hz. Here, | |
| // we need to lowpass-filter to make sure we get rid of all frequencies | |
| // above 22050/2 = 11025 Hz, otherwise aliasing will occur in the downsampled | |
| // signal. | |
| // | |
| // If instead we downsample to a rate that is *higher* than the original | |
| // one, we lowpass-filter with the original signal's Nyquist frequency. | |
| // If we convert from 24000 Hz to 44100 Hz, we lowpass-filter to get rid | |
| // of all frequencies above 24000 / 2 = 12000 Hz. | |
| // | |
| // Again, the actual sample rate does not matter, only the factors do. | |
| // We simply pick the higher of the two (up/downsampling factors). This | |
| // corresponds to picking the lower of the two sample rates (input/output | |
| // sample rates). | |
| float scale_factor = float(std::max(p_downsampling_factor, p_upsampling_factor)); | |
| // The polyphase filter bank is stored as a one-dimensional array. | |
| // The polyphase filters are arranged in the array as shown in | |
| // this example: | |
| // | |
| // AAABBBCCCDDD | |
| // | |
| // Where A,B,C,D are the coefficients of polyphase filters A,B,C,D. | |
| // Upsampling factor is 4 (that's why there are four filters). Each | |
| // filter has 3 taps in this example. | |
| // NOTE: It would be useful to reverse the filters, that is, | |
| // coefficients 1234 -> 4321, to make convolution easier, because | |
| // then it can be done using a simple multiply-and-add operation. | |
| // (Convolution does multiply-and-add, but with one of the inputs | |
| // reversed, so by pre-reversing one, it devolves into a simple | |
| // multiply-and-add.) | |
| std::size_t num_filters = p_upsampling_factor; | |
| std::size_t num_filter_taps = window.size() / num_filters; | |
| for (std::size_t i = 0; i < window.size(); ++i) | |
| { | |
| // Note that we do _not_ scale t by the window size here. So, we do not | |
| // normalize it in any way. This is intentional: The filter size defines | |
| // the _quality_ of the filtering. Larger filter means more sinc coefficients, | |
| // or in other words, we tap a larger range of sinc. If we normalized t, | |
| // we would always tap the _same_ range of the sinc function (the area | |
| // around the main lobe). | |
| // We do however offset t to make sure it ranges from -w/2 to w/2-1 instead of | |
| // 0 to w-1 . Otherwise we don't get a symmetric sinc tap. | |
| // TODO: Should it be from -w to +w instead? | |
| float t = int(i) - int(window.size() / 2); | |
| // Compute stretched sinc. | |
| float f = sinc(t / scale_factor); | |
| std::size_t polyphase_filter_index = i % num_filters; | |
| std::size_t offset_in_polyphase_filter = i / num_filters; | |
| std::size_t filter_array_idx = polyphase_filter_index * num_filter_taps + offset_in_polyphase_filter; | |
| // Apply the window function on top of the sinc | |
| // function to produce filter coefficients. | |
| filter[filter_array_idx] = f * window[i]; | |
| } | |
| // TODO: It is unclear why this is necessary, but without this, | |
| // the output signal may be incorrectly amplified. | |
| if (p_downsampling_factor > p_upsampling_factor) | |
| { | |
| float dampening_factor = float(p_upsampling_factor) / p_downsampling_factor; | |
| for (auto & filter_coefficient : filter) | |
| filter_coefficient *= dampening_factor; | |
| } | |
| return filter; | |
| } | |
| //// Sound I/O functionality //// | |
| class sound_file | |
| { | |
| public: | |
| sound_file() | |
| : m_sndfile(nullptr) | |
| , m_num_samples(0) | |
| , m_sample_rate(0) | |
| { | |
| } | |
| bool open_for_reading(std::string const &p_filename) | |
| { | |
| if (m_sndfile != nullptr) | |
| { | |
| std::cerr << "A file has already been opened\n"; | |
| return false; | |
| } | |
| SF_INFO info; | |
| info.format = 0; | |
| if (!open_internal(p_filename, true, info)) | |
| return false; | |
| if (info.channels != 1) | |
| { | |
| sf_close(m_sndfile); | |
| m_sndfile = nullptr; | |
| std::cerr << "File \"" << p_filename << "\" has " << info.channels << " channels; only mono is supported" << std::endl; | |
| return false; | |
| } | |
| // Technically, this is wrong, since 1 frame is a collection of N | |
| // samples, with N being the number of channels. But, since we | |
| // anyway only support mono in this example, it does not matter. | |
| m_num_samples = info.frames; | |
| m_sample_rate = info.samplerate; | |
| return true; | |
| } | |
| bool open_for_writing(std::string const &p_filename, unsigned int const p_sample_rate) | |
| { | |
| if (m_sndfile != nullptr) | |
| { | |
| std::cerr << "A file has already been opened\n"; | |
| return false; | |
| } | |
| SF_INFO info; | |
| info.format = SF_FORMAT_WAV | SF_FORMAT_FLOAT; | |
| info.samplerate = p_sample_rate; | |
| info.channels = 1; | |
| if (!open_internal(p_filename, false, info)) | |
| return false; | |
| m_num_samples = 0; | |
| m_sample_rate = p_sample_rate; | |
| return true; | |
| } | |
| ~sound_file() | |
| { | |
| if (m_sndfile != nullptr) | |
| sf_close(m_sndfile); | |
| } | |
| std::size_t get_total_num_input_samples() const | |
| { | |
| return m_num_samples; | |
| } | |
| unsigned int get_sample_rate() const | |
| { | |
| return m_sample_rate; | |
| } | |
| std::size_t read_all_samples(samples &p_samples) | |
| { | |
| return read_samples(p_samples, m_num_samples); | |
| } | |
| std::size_t read_samples(samples &p_samples, std::size_t const p_num_samples_to_read) | |
| { | |
| if (p_num_samples_to_read > p_samples.size()) | |
| p_samples.resize(p_num_samples_to_read); | |
| return sf_read_float(m_sndfile, &p_samples[0], p_num_samples_to_read); | |
| } | |
| void write_samples(samples const &p_samples, std::size_t p_num_samples_to_write) | |
| { | |
| if (p_num_samples_to_write > p_samples.size()) | |
| p_num_samples_to_write = p_samples.size(); | |
| sf_write_float(m_sndfile, &p_samples[0], p_num_samples_to_write); | |
| } | |
| private: | |
| bool open_internal(std::string const &p_filename, bool const p_read, SF_INFO &p_info) | |
| { | |
| m_sndfile = sf_open(p_filename.c_str(), p_read ? SFM_READ : SFM_WRITE, &p_info); | |
| if (m_sndfile == nullptr) | |
| { | |
| std::cerr << "Could not open file \"" << p_filename << "\" for " << (p_read ? "reading" : "writing") << std::endl; | |
| } | |
| return (m_sndfile != nullptr); | |
| } | |
| SNDFILE *m_sndfile; | |
| std::size_t m_num_samples; | |
| unsigned int m_sample_rate; | |
| }; | |
| //// main //// | |
| int main(int argc, char *argv[]) | |
| { | |
| // Get the command line arguments. | |
| if (argc < 5) | |
| { | |
| std::cerr << "Usage: " << argv[0] << " <input filename> <output WAV filename> <output sample rate> <filter size>\n"; | |
| return -1; | |
| } | |
| std::string input_filename = argv[1]; | |
| std::string output_filename = argv[2]; | |
| unsigned int output_sample_rate = std::stoul(argv[3]); | |
| std::size_t filter_size = std::stoul(argv[4]); | |
| #ifdef WITH_RLIMIT | |
| // Limit the amount of memory this process can allocate to 1 GB to | |
| // prevent lots of swap activity in case we allocate too much. Since | |
| // this example just loads the entirety of the input file to memory, | |
| // this can happen with long tracks. With this rlimit, the process | |
| // is killed as soon as the limit is hit. | |
| struct rlimit memlim; | |
| getrlimit(RLIMIT_AS, &memlim); | |
| memlim.rlim_cur = std::min(std::size_t(1024*1024*1024), memlim.rlim_cur); | |
| setrlimit(RLIMIT_AS, &memlim); | |
| #endif | |
| // Open the input file. | |
| // NOTE: In this example, we read and sample rate convert the entire | |
| // input file at once. This is for sake of clarity and simplicity. | |
| // In production, the conversion would not be done that way. Instead, | |
| // the input samples would be streamed in and converted on the fly. | |
| // This saves a ton of memory, and would also work with live streams. | |
| sound_file input_sound_file; | |
| if (!input_sound_file.open_for_reading(input_filename)) | |
| return -1; | |
| samples input_samples; | |
| input_sound_file.read_all_samples(input_samples); | |
| if (input_samples.empty()) | |
| { | |
| std::cerr << "Did not get any input samples\n"; | |
| return -1; | |
| } | |
| unsigned int input_sample_rate = input_sound_file.get_sample_rate(); | |
| // Sample rate conversion works by first interpolating the signal | |
| // to a higher sample rate, then decimating to a lower sample rate. | |
| // | |
| // Interpolation means that new samples are introduced in between | |
| // the original input samples, followed by a lowpass filter that is | |
| // applied on that augmented input signal. The signal is augmented | |
| // by inserting zeros in between the original samples. This is | |
| // called "zero-stuffing". Zero-stuffing introduces copies of the | |
| // original spectrum, _above_ the original spectrum. This is where | |
| // the low-pass filter comes in - it cuts off these unwanted copies, | |
| // leaving us with an upsampled version of the original signal. | |
| // | |
| // So, if we want to upsample the signal by an integer factor N, | |
| // we insert N-1 nullsamples between the original samples. | |
| // | |
| // Decimation can then be applied. This simply involves picking | |
| // every Mth sample, M being the decimation factor. M=1 means no | |
| // decimation. | |
| // | |
| // In short, first we upsample by N by applying interpolation, | |
| // then we downsample by M by applying decimation. The M/N ratio | |
| // is the overall sample rate conversion ratio. Upsampling factor | |
| // N is also the interpolation factor N. Downsampling factor M | |
| // is also the decimation factor M. | |
| // | |
| // The up/downsampling factors are derived from the input and | |
| // output sample rates. To that end, the least common denominator | |
| // of the sample rates is determined. That's because the up- and | |
| // downsampling factors influence filter sizes, and we want filters | |
| // to not be unnecessarily large. For example, input sample rate | |
| // 48000 Hz, output sample rate 44100 Hz, that's a ratio of | |
| // 48000/44100. We could use N=48000, M=44100, but for better | |
| // efficiency (as explained above), we reduce this and come up | |
| // with an equal ratio of 160/147. | |
| // | |
| // Once the factors are known, we could in theory get the input | |
| // signal's samples and stuff in nullsamples between these samples. | |
| // However, that would be wasteful (in the example above it would | |
| // increase the input signal size by a factor of 160). We can | |
| // make the observation that during convolution, these nullsamples | |
| // don't contribute to the output, because these nullsamples are | |
| // multiplied with filter coefficients and added. So, an equation | |
| // like: | |
| // | |
| // output = sample * coeff1 + 0 * coeff2 + 0 * coeff3 ... | |
| // | |
| // will only really be influenced by the original samples, not | |
| // by the added nullsamples. | |
| // | |
| // The idea then is to simply omit these nullsamples and only | |
| // pick the coefficients that would actually be applied (in the | |
| // example above, coeff1 would be applied, while coeff2 and | |
| // coeff3 would not). We observe that in a zerostuffed signal, | |
| // the original samples appear in every Nth position. So, in this | |
| // zerostuffed signal: | |
| // | |
| // a 0 0 b 0 0 c .. | |
| // | |
| // every 3rd sample is an original sample. If we have a filter | |
| // with 12 taps, then its coefficients can be decomposed into | |
| // subfilters. The number of filters equal the upsampling factor. | |
| // In this example, the decomposition would be: | |
| // | |
| // Original filter: h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12 | |
| // Subfilter 1: h1 h4 h7 h10 | |
| // Subfilter 2: h2 h5 h8 h11 | |
| // Subfilter 3: h3 h6 h9 h12 | |
| // | |
| // During decimation, we would normally pick a sample from the | |
| // interpolated signal. Suppose that we didn't use polyphase | |
| // filters, but instead actually did use zerostuffing and | |
| // filtered that zerostuffed signal. Suppose upsampling factor | |
| // N is 3, downsampling factor M is 2. | |
| // | |
| // Original signal: i1 i2 i3 i4 i5 i6 i7 .. | |
| // Zerostuffed signal: i1 0 0 i2 0 0 i3 0 0 i4 0 0 i5 0 0 .. | |
| // Interpolated signal: i1 a b i2 c d i3 e f i4 g h i6 i j .. | |
| // (a-j are newly interpolated samples that resulted from | |
| // convolving the filter with the zerostuffed signal) | |
| // | |
| // Decimation factor M = 2 means we pick every 2nd | |
| // interpolated sample, and get the output signal | |
| // i1 b c i3 f g i6 .. | |
| // | |
| // We optimize this by using polyphase filters, eliminating | |
| // the need for an intermediate zerostuffed signal. Instead, | |
| // we pick the subfilter that would produce the sample that | |
| // we pick during decimation. | |
| // | |
| // From the example above, suppose we are about to pick | |
| // sample "b". to compute "b", we pick the appropriate | |
| // subfilter. We find this subfilter by computing the | |
| // position in the interpolated signal. This we call the | |
| // "interpolated position". In the case of "b", this position | |
| // would be 2. (Positions start at 0.) We then apply modulo 3 | |
| // (the number of subfilters) to pick the appropriate subfilter. | |
| // That's subfilter (2 mod 3) = 2 in this case. | |
| // | |
| // Then we need the position in the original input signal that | |
| // corresponds to the "b" sample". This we get by calculating | |
| // position_in_original_signal = position_in_output_signal * M / N | |
| // So, in this case, "b" is at position #1 in the output signal, | |
| // and 1*2/3 = 0. Now we have the position in the input signal | |
| // where we get the input samples that we want to convolve with | |
| // the subfilter #2 we picked earlier. This means that this | |
| // subfilter is convolved with input samples i1 to i4. | |
| // | |
| // Another example would be if we wanted to compute sample "g", | |
| // which is at "interpolated position" position 10 and at output | |
| // position 5. 10 mod 3 = 1, and 5*2/3 = 3. So, we would convolve | |
| // input samples i4 to i7 with subfilter #1. | |
| // Compute the up- and downsampling factors. | |
| unsigned int sample_rate_lcm = lcm(input_sample_rate, output_sample_rate); | |
| // This is the interpolation factor N mentioned above. | |
| unsigned int upsampling_factor = sample_rate_lcm / input_sample_rate; | |
| // This is the decimation factor M mentioned above. | |
| unsigned int downsampling_factor = sample_rate_lcm / output_sample_rate; | |
| // Print some info. | |
| std::cerr << "Input / output sample rate: " << input_sample_rate << " Hz / " << output_sample_rate << " Hz\n"; | |
| std::cerr << "Up / downsampling factors: " << upsampling_factor << " / " << downsampling_factor << "\n"; | |
| std::cerr << "Filter size: " << filter_size << "\n"; | |
| // Prepare the polyphase filter and the output samples buffer. | |
| samples polyphase_filter_bank = compute_polyphase_filter_bank(upsampling_factor, downsampling_factor, filter_size); | |
| samples output_samples(input_samples.size() * upsampling_factor / downsampling_factor, 0); | |
| std::size_t num_polyphase_filters = upsampling_factor; | |
| std::size_t polyphase_filter_size = polyphase_filter_bank.size() / num_polyphase_filters; | |
| // Open the output file. | |
| sound_file output_sound_file; | |
| if (!output_sound_file.open_for_writing(output_filename, output_sample_rate)) | |
| return -1; | |
| // Now perform the sample rate conversion. | |
| std::size_t output_position = 0; | |
| std::size_t interpolated_position = 0; | |
| std::size_t last_progress = 0; | |
| // Don't iterate over the last filter_size samples. That's because | |
| // convolve() will access all samples from output_position to | |
| // (output_position + polyphase_filter_size - 1). | |
| for (; output_position < output_samples.size() - filter_size; ++output_position, interpolated_position += downsampling_factor) | |
| { | |
| // Print some progress. We keep track of the last computed | |
| // progress to check if we should actually print something. | |
| // Otherwise, the constant console output could actually | |
| // slow down this code (and it floods the console with | |
| // lines of course). | |
| std::size_t progress = output_position / 100000; | |
| if (progress != last_progress) | |
| { | |
| std::cerr << output_position << "/" << output_samples.size() << "(" << (float(output_position) / float(output_samples.size()) * 100.0f) << "%)\n"; | |
| last_progress = progress; | |
| } | |
| // Pick the right subfilter. | |
| std::size_t polyphase_filter_index = interpolated_position % num_polyphase_filters; | |
| sample_type const *polyphase_filter_coefficients = &(polyphase_filter_bank[polyphase_filter_index * polyphase_filter_size]); | |
| // Pick what input samples we need to convolve with the | |
| // chosen subfilter. | |
| sample_type const *input_samples_coefficients = &(input_samples[output_position * downsampling_factor / upsampling_factor]); | |
| // Perform the convolution, producing the sample rate | |
| // converted output. | |
| output_samples[output_position] = convolve(input_samples_coefficients, polyphase_filter_coefficients, polyphase_filter_size); | |
| } | |
| // Write the result. | |
| output_sound_file.write_samples(output_samples, output_samples.size()); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment