Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save dv1/dfeabd20fd2bcb5f702fdd12dc192c97 to your computer and use it in GitHub Desktop.
Save dv1/dfeabd20fd2bcb5f702fdd12dc192c97 to your computer and use it in GitHub Desktop.
Simple example for how to perform bandlimited sample rate conversion with a polyphase filter
#include <vector>
#include <string>
#include <cmath>
#include <iostream>
#include <sndfile.h>
#ifdef WITH_RLIMIT
#include <sys/resource.h>
#endif
//// Common types ////
typedef float sample_type;
typedef std::vector < sample_type > samples;
//// Math functions ////
static float const pi = 3.1415926535f;
unsigned int gcd(unsigned int p_first, unsigned int p_second)
{
if (p_first == 0) return p_second;
if (p_second == 0) return p_first;
do
{
unsigned int temp = p_first % p_second;
p_first = p_second;
p_second = temp;
}
while (p_second != 0);
return p_first;
}
unsigned int lcm(unsigned int p_first, unsigned int p_second)
{
unsigned int temp = gcd(p_first, p_second);
return (temp != 0) ? (p_first * p_second / temp) : 0;
}
float sinc(float f)
{
return (f == 0.0f) ? 1.0f : (std::sin(f * pi) / (f * pi));
}
float hanning_func(int n, int N)
{
return 0.5f * (1.0f - std::cos(2.0f * pi * float(n) / float(N - 1)));
}
samples compute_hanning_window(std::size_t const p_window_size)
{
samples window(p_window_size, 0);
for (std::size_t i = 0; i < p_window_size; ++i)
window[i] = hanning_func(i, p_window_size);
return window;
}
sample_type convolve(sample_type const *p_first_samples, sample_type const *p_second_samples, std::size_t const p_num_samples)
{
sample_type result = 0.0f;
int num_samples = int(p_num_samples);
for (int first_index = 0; first_index < num_samples; ++first_index)
{
int second_index = num_samples - 1 - first_index;
sample_type first_sample = p_first_samples[first_index];
sample_type second_sample = p_second_samples[second_index];
result += first_sample * second_sample;
}
return result;
}
samples compute_polyphase_filter_bank(std::size_t const p_upsampling_factor, std::size_t const p_downsampling_factor, std::size_t const p_filter_size)
{
// Create the window function that will be applied on top of the
// stretched sinc. The window size equals the filter size plus the
// upsampling factor, since we need to account for the extra nullsamples that
// get inserted during zerostuffing.
samples window = compute_hanning_window(p_filter_size * p_upsampling_factor);
samples filter(window.size(), 0);
// We need to create a low pass filter that cuts off frequencies at half of
// either the input or the output sample rate, whichever is lower. However,
// it turns out that for the filter calculation, we don't really need the
// sample rates directly. Instead, we only need to "stretch" sinc by a
// specific factor. A factor of 1 means no change. A factor of 2 means that
// half of the original frequency range is cut off etc.
//
// If we only want to upsample by a factor of 2, and don't want to downsample,
// then it is sufficient to stretch the sinc by a factor of 2. This is because
// when we upsample, we apply zero stuffing, which creates unwantd spectral
// images above the desired range. So, if for example we upsample by a factor
// of 2, this means we insert 2-1 = 1 nullsample in between the original
// samples (this is the zero-stuffing part). Like this:
//
// a b c d e f ... -> a 0 b 0 c 0 d 0 e 0 f 0 ...
//
// The lower 50% of the spectrum contains the original spectral image. So, we
// need to get rid of anything except the lower half in the zero-stuffed signal.
//
// Another example: Upsampling by a factor of 3, no downsampling. Here,
// we have 2 unwanted spectral images above the original one, and 3-1 = 2
// nullsamples were stuffed in between the original samples. Like this:
//
// a b c d ... -> a 0 0 b 0 0 c 0 0 d 0 0 ...
//
// Since we only want the original image, and its spectrum only makes 1/3rd
// of the spectrum of the zerostuffed signal, we stretch sinc by a factor of
// 3, which causes it to lowpass-filter anything except the lowest 33,3% of
// the spectrum.
//
// If we also downsample, then it depends by how much. If the downsampling
// factor is higher than the upsampling, it means that the downsampled
// signal will have a Nyquist frequency that is *lower* than that of the
// original signal. Example: converting from 24000 Hz to 22050 Hz. Here,
// we need to lowpass-filter to make sure we get rid of all frequencies
// above 22050/2 = 11025 Hz, otherwise aliasing will occur in the downsampled
// signal.
//
// If instead we downsample to a rate that is *higher* than the original
// one, we lowpass-filter with the original signal's Nyquist frequency.
// If we convert from 24000 Hz to 44100 Hz, we lowpass-filter to get rid
// of all frequencies above 24000 / 2 = 12000 Hz.
//
// Again, the actual sample rate does not matter, only the factors do.
// We simply pick the higher of the two (up/downsampling factors). This
// corresponds to picking the lower of the two sample rates (input/output
// sample rates).
float scale_factor = float(std::max(p_downsampling_factor, p_upsampling_factor));
// The polyphase filter bank is stored as a one-dimensional array.
// The polyphase filters are arranged in the array as shown in
// this example:
//
// AAABBBCCCDDD
//
// Where A,B,C,D are the coefficients of polyphase filters A,B,C,D.
// Upsampling factor is 4 (that's why there are four filters). Each
// filter has 3 taps in this example.
// NOTE: It would be useful to reverse the filters, that is,
// coefficients 1234 -> 4321, to make convolution easier, because
// then it can be done using a simple multiply-and-add operation.
// (Convolution does multiply-and-add, but with one of the inputs
// reversed, so by pre-reversing one, it devolves into a simple
// multiply-and-add.)
std::size_t num_filters = p_upsampling_factor;
std::size_t num_filter_taps = window.size() / num_filters;
for (std::size_t i = 0; i < window.size(); ++i)
{
// Note that we do _not_ scale t by the window size here. So, we do not
// normalize it in any way. This is intentional: The filter size defines
// the _quality_ of the filtering. Larger filter means more sinc coefficients,
// or in other words, we tap a larger range of sinc. If we normalized t,
// we would always tap the _same_ range of the sinc function (the area
// around the main lobe).
// We do however offset t to make sure it ranges from -w/2 to w/2-1 instead of
// 0 to w-1 . Otherwise we don't get a symmetric sinc tap.
// TODO: Should it be from -w to +w instead?
float t = int(i) - int(window.size() / 2);
// Compute stretched sinc.
float f = sinc(t / scale_factor);
std::size_t polyphase_filter_index = i % num_filters;
std::size_t offset_in_polyphase_filter = i / num_filters;
std::size_t filter_array_idx = polyphase_filter_index * num_filter_taps + offset_in_polyphase_filter;
// Apply the window function on top of the sinc
// function to produce filter coefficients.
filter[filter_array_idx] = f * window[i];
}
// TODO: It is unclear why this is necessary, but without this,
// the output signal may be incorrectly amplified.
if (p_downsampling_factor > p_upsampling_factor)
{
float dampening_factor = float(p_upsampling_factor) / p_downsampling_factor;
for (auto & filter_coefficient : filter)
filter_coefficient *= dampening_factor;
}
return filter;
}
//// Sound I/O functionality ////
class sound_file
{
public:
sound_file()
: m_sndfile(nullptr)
, m_num_samples(0)
, m_sample_rate(0)
{
}
bool open_for_reading(std::string const &p_filename)
{
if (m_sndfile != nullptr)
{
std::cerr << "A file has already been opened\n";
return false;
}
SF_INFO info;
info.format = 0;
if (!open_internal(p_filename, true, info))
return false;
if (info.channels != 1)
{
sf_close(m_sndfile);
m_sndfile = nullptr;
std::cerr << "File \"" << p_filename << "\" has " << info.channels << " channels; only mono is supported" << std::endl;
return false;
}
// Technically, this is wrong, since 1 frame is a collection of N
// samples, with N being the number of channels. But, since we
// anyway only support mono in this example, it does not matter.
m_num_samples = info.frames;
m_sample_rate = info.samplerate;
return true;
}
bool open_for_writing(std::string const &p_filename, unsigned int const p_sample_rate)
{
if (m_sndfile != nullptr)
{
std::cerr << "A file has already been opened\n";
return false;
}
SF_INFO info;
info.format = SF_FORMAT_WAV | SF_FORMAT_FLOAT;
info.samplerate = p_sample_rate;
info.channels = 1;
if (!open_internal(p_filename, false, info))
return false;
m_num_samples = 0;
m_sample_rate = p_sample_rate;
return true;
}
~sound_file()
{
if (m_sndfile != nullptr)
sf_close(m_sndfile);
}
std::size_t get_total_num_input_samples() const
{
return m_num_samples;
}
unsigned int get_sample_rate() const
{
return m_sample_rate;
}
std::size_t read_all_samples(samples &p_samples)
{
return read_samples(p_samples, m_num_samples);
}
std::size_t read_samples(samples &p_samples, std::size_t const p_num_samples_to_read)
{
if (p_num_samples_to_read > p_samples.size())
p_samples.resize(p_num_samples_to_read);
return sf_read_float(m_sndfile, &p_samples[0], p_num_samples_to_read);
}
void write_samples(samples const &p_samples, std::size_t p_num_samples_to_write)
{
if (p_num_samples_to_write > p_samples.size())
p_num_samples_to_write = p_samples.size();
sf_write_float(m_sndfile, &p_samples[0], p_num_samples_to_write);
}
private:
bool open_internal(std::string const &p_filename, bool const p_read, SF_INFO &p_info)
{
m_sndfile = sf_open(p_filename.c_str(), p_read ? SFM_READ : SFM_WRITE, &p_info);
if (m_sndfile == nullptr)
{
std::cerr << "Could not open file \"" << p_filename << "\" for " << (p_read ? "reading" : "writing") << std::endl;
}
return (m_sndfile != nullptr);
}
SNDFILE *m_sndfile;
std::size_t m_num_samples;
unsigned int m_sample_rate;
};
//// main ////
int main(int argc, char *argv[])
{
// Get the command line arguments.
if (argc < 5)
{
std::cerr << "Usage: " << argv[0] << " <input filename> <output WAV filename> <output sample rate> <filter size>\n";
return -1;
}
std::string input_filename = argv[1];
std::string output_filename = argv[2];
unsigned int output_sample_rate = std::stoul(argv[3]);
std::size_t filter_size = std::stoul(argv[4]);
#ifdef WITH_RLIMIT
// Limit the amount of memory this process can allocate to 1 GB to
// prevent lots of swap activity in case we allocate too much. Since
// this example just loads the entirety of the input file to memory,
// this can happen with long tracks. With this rlimit, the process
// is killed as soon as the limit is hit.
struct rlimit memlim;
getrlimit(RLIMIT_AS, &memlim);
memlim.rlim_cur = std::min(std::size_t(1024*1024*1024), memlim.rlim_cur);
setrlimit(RLIMIT_AS, &memlim);
#endif
// Open the input file.
// NOTE: In this example, we read and sample rate convert the entire
// input file at once. This is for sake of clarity and simplicity.
// In production, the conversion would not be done that way. Instead,
// the input samples would be streamed in and converted on the fly.
// This saves a ton of memory, and would also work with live streams.
sound_file input_sound_file;
if (!input_sound_file.open_for_reading(input_filename))
return -1;
samples input_samples;
input_sound_file.read_all_samples(input_samples);
if (input_samples.empty())
{
std::cerr << "Did not get any input samples\n";
return -1;
}
unsigned int input_sample_rate = input_sound_file.get_sample_rate();
// Sample rate conversion works by first interpolating the signal
// to a higher sample rate, then decimating to a lower sample rate.
//
// Interpolation means that new samples are introduced in between
// the original input samples, followed by a lowpass filter that is
// applied on that augmented input signal. The signal is augmented
// by inserting zeros in between the original samples. This is
// called "zero-stuffing". Zero-stuffing introduces copies of the
// original spectrum, _above_ the original spectrum. This is where
// the low-pass filter comes in - it cuts off these unwanted copies,
// leaving us with an upsampled version of the original signal.
//
// So, if we want to upsample the signal by an integer factor N,
// we insert N-1 nullsamples between the original samples.
//
// Decimation can then be applied. This simply involves picking
// every Mth sample, M being the decimation factor. M=1 means no
// decimation.
//
// In short, first we upsample by N by applying interpolation,
// then we downsample by M by applying decimation. The M/N ratio
// is the overall sample rate conversion ratio. Upsampling factor
// N is also the interpolation factor N. Downsampling factor M
// is also the decimation factor M.
//
// The up/downsampling factors are derived from the input and
// output sample rates. To that end, the least common denominator
// of the sample rates is determined. That's because the up- and
// downsampling factors influence filter sizes, and we want filters
// to not be unnecessarily large. For example, input sample rate
// 48000 Hz, output sample rate 44100 Hz, that's a ratio of
// 48000/44100. We could use N=48000, M=44100, but for better
// efficiency (as explained above), we reduce this and come up
// with an equal ratio of 160/147.
//
// Once the factors are known, we could in theory get the input
// signal's samples and stuff in nullsamples between these samples.
// However, that would be wasteful (in the example above it would
// increase the input signal size by a factor of 160). We can
// make the observation that during convolution, these nullsamples
// don't contribute to the output, because these nullsamples are
// multiplied with filter coefficients and added. So, an equation
// like:
//
// output = sample * coeff1 + 0 * coeff2 + 0 * coeff3 ...
//
// will only really be influenced by the original samples, not
// by the added nullsamples.
//
// The idea then is to simply omit these nullsamples and only
// pick the coefficients that would actually be applied (in the
// example above, coeff1 would be applied, while coeff2 and
// coeff3 would not). We observe that in a zerostuffed signal,
// the original samples appear in every Nth position. So, in this
// zerostuffed signal:
//
// a 0 0 b 0 0 c ..
//
// every 3rd sample is an original sample. If we have a filter
// with 12 taps, then its coefficients can be decomposed into
// subfilters. The number of filters equal the upsampling factor.
// In this example, the decomposition would be:
//
// Original filter: h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12
// Subfilter 1: h1 h4 h7 h10
// Subfilter 2: h2 h5 h8 h11
// Subfilter 3: h3 h6 h9 h12
//
// During decimation, we would normally pick a sample from the
// interpolated signal. Suppose that we didn't use polyphase
// filters, but instead actually did use zerostuffing and
// filtered that zerostuffed signal. Suppose upsampling factor
// N is 3, downsampling factor M is 2.
//
// Original signal: i1 i2 i3 i4 i5 i6 i7 ..
// Zerostuffed signal: i1 0 0 i2 0 0 i3 0 0 i4 0 0 i5 0 0 ..
// Interpolated signal: i1 a b i2 c d i3 e f i4 g h i6 i j ..
// (a-j are newly interpolated samples that resulted from
// convolving the filter with the zerostuffed signal)
//
// Decimation factor M = 2 means we pick every 2nd
// interpolated sample, and get the output signal
// i1 b c i3 f g i6 ..
//
// We optimize this by using polyphase filters, eliminating
// the need for an intermediate zerostuffed signal. Instead,
// we pick the subfilter that would produce the sample that
// we pick during decimation.
//
// From the example above, suppose we are about to pick
// sample "b". to compute "b", we pick the appropriate
// subfilter. We find this subfilter by computing the
// position in the interpolated signal. This we call the
// "interpolated position". In the case of "b", this position
// would be 2. (Positions start at 0.) We then apply modulo 3
// (the number of subfilters) to pick the appropriate subfilter.
// That's subfilter (2 mod 3) = 2 in this case.
//
// Then we need the position in the original input signal that
// corresponds to the "b" sample". This we get by calculating
// position_in_original_signal = position_in_output_signal * M / N
// So, in this case, "b" is at position #1 in the output signal,
// and 1*2/3 = 0. Now we have the position in the input signal
// where we get the input samples that we want to convolve with
// the subfilter #2 we picked earlier. This means that this
// subfilter is convolved with input samples i1 to i4.
//
// Another example would be if we wanted to compute sample "g",
// which is at "interpolated position" position 10 and at output
// position 5. 10 mod 3 = 1, and 5*2/3 = 3. So, we would convolve
// input samples i4 to i7 with subfilter #1.
// Compute the up- and downsampling factors.
unsigned int sample_rate_lcm = lcm(input_sample_rate, output_sample_rate);
// This is the interpolation factor N mentioned above.
unsigned int upsampling_factor = sample_rate_lcm / input_sample_rate;
// This is the decimation factor M mentioned above.
unsigned int downsampling_factor = sample_rate_lcm / output_sample_rate;
// Print some info.
std::cerr << "Input / output sample rate: " << input_sample_rate << " Hz / " << output_sample_rate << " Hz\n";
std::cerr << "Up / downsampling factors: " << upsampling_factor << " / " << downsampling_factor << "\n";
std::cerr << "Filter size: " << filter_size << "\n";
// Prepare the polyphase filter and the output samples buffer.
samples polyphase_filter_bank = compute_polyphase_filter_bank(upsampling_factor, downsampling_factor, filter_size);
samples output_samples(input_samples.size() * upsampling_factor / downsampling_factor, 0);
std::size_t num_polyphase_filters = upsampling_factor;
std::size_t polyphase_filter_size = polyphase_filter_bank.size() / num_polyphase_filters;
// Open the output file.
sound_file output_sound_file;
if (!output_sound_file.open_for_writing(output_filename, output_sample_rate))
return -1;
// Now perform the sample rate conversion.
std::size_t output_position = 0;
std::size_t interpolated_position = 0;
std::size_t last_progress = 0;
// Don't iterate over the last filter_size samples. That's because
// convolve() will access all samples from output_position to
// (output_position + polyphase_filter_size - 1).
for (; output_position < output_samples.size() - filter_size; ++output_position, interpolated_position += downsampling_factor)
{
// Print some progress. We keep track of the last computed
// progress to check if we should actually print something.
// Otherwise, the constant console output could actually
// slow down this code (and it floods the console with
// lines of course).
std::size_t progress = output_position / 100000;
if (progress != last_progress)
{
std::cerr << output_position << "/" << output_samples.size() << "(" << (float(output_position) / float(output_samples.size()) * 100.0f) << "%)\n";
last_progress = progress;
}
// Pick the right subfilter.
std::size_t polyphase_filter_index = interpolated_position % num_polyphase_filters;
sample_type const *polyphase_filter_coefficients = &(polyphase_filter_bank[polyphase_filter_index * polyphase_filter_size]);
// Pick what input samples we need to convolve with the
// chosen subfilter.
sample_type const *input_samples_coefficients = &(input_samples[output_position * downsampling_factor / upsampling_factor]);
// Perform the convolution, producing the sample rate
// converted output.
output_samples[output_position] = convolve(input_samples_coefficients, polyphase_filter_coefficients, polyphase_filter_size);
}
// Write the result.
output_sound_file.write_samples(output_samples, output_samples.size());
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment