dv1 · May 11, 2019 20:19
diff --git a/sample-rate-conversion-with-polyphase-filtering-example.cpp b/sample-rate-conversion-with-polyphase-filtering-example.cpp
 #include <algorithm>
 #include <cmath>
 #include <cstddef>
 #include <exception>
 #include <iostream>
 #include <string>
 #include <vector>

 #include <sndfile.h>

 #ifdef WITH_RLIMIT
 #include <sys/resource.h>
 #endif


 //// Common types ////

 typedef float sample_type;
 typedef std::vector < sample_type > samples;


 //// Math functions ////

 static float const pi = 3.1415926535f;

 // Greatest common divisor of two unsigned integers, computed with the
 // iterative Euclidean algorithm. If one argument is zero, the other one
 // is returned unchanged (so gcd(0, 0) is 0).
 unsigned int gcd(unsigned int p_first, unsigned int p_second)
 {
 	if (p_first == 0)
 		return p_second;

 	// Replace (a, b) by (b, a mod b) until the remainder vanishes;
 	// the last non-zero value is the divisor we are looking for.
 	while (p_second != 0)
 	{
 		unsigned int const remainder = p_first % p_second;
 		p_first = p_second;
 		p_second = remainder;
 	}

 	return p_first;
 }

 // Least common multiple of two unsigned integers. Returns 0 if both
 // arguments are zero (and also if just one of them is zero).
 //
 // The result itself must fit into an unsigned int; no overflow check is
 // done for that.
 unsigned int lcm(unsigned int p_first, unsigned int p_second)
 {
 	unsigned int const divisor = gcd(p_first, p_second);
 	if (divisor == 0)
 		return 0;

 	// Divide before multiplying. p_first is an exact multiple of the
 	// divisor, so the result is the same, but the intermediate value never
 	// exceeds the final result. Multiplying first overflows for common
 	// sample rate pairs such as 96000 and 88200.
 	return (p_first / divisor) * p_second;
 }

 // Normalized sinc function: sin(pi * f) / (pi * f). The value at f == 0,
 // where the quotient would be 0/0, is defined as 1.
 float sinc(float f)
 {
 	if (f == 0.0f)
 		return 1.0f;

 	return std::sin(f * pi) / (f * pi);
 }

 // Value of an N-point Hann ("hanning") window at index n:
 //
 //   0.5 * (1 - cos(2 * pi * n / (N - 1)))
 //
 // For N == 1 the formula divides zero by zero and would yield NaN, so the
 // single-point window is defined as 1 instead.
 float hanning_func(int n, int N)
 {
 	if (N == 1)
 		return 1.0f;

 	return 0.5f * (1.0f - std::cos(2.0f * pi * float(n) / float(N - 1)));
 }

 // Returns a vector with p_window_size entries, where entry n holds
 // hanning_func(n, p_window_size).
 samples compute_hanning_window(std::size_t const p_window_size)
 {
 	samples window;
 	window.reserve(p_window_size);

 	for (std::size_t n = 0; n != p_window_size; ++n)
 		window.push_back(hanning_func(static_cast<int>(n), static_cast<int>(p_window_size)));

 	return window;
 }

 // Computes one output value of a convolution: the sum of
 // p_first_samples[i] * p_second_samples[p_num_samples - 1 - i] for all i
 // in [0, p_num_samples). Both arrays must hold at least p_num_samples
 // values. Returns 0 if p_num_samples is 0.
 sample_type convolve(sample_type const *p_first_samples, sample_type const *p_second_samples, std::size_t const p_num_samples)
 {
 	int const count = static_cast<int>(p_num_samples);
 	sample_type accumulator = 0.0f;

 	// Walk the first array front to back while walking the second one
 	// back to front.
 	for (int forward = 0, backward = count - 1; forward < count; ++forward, --backward)
 		accumulator += p_first_samples[forward] * p_second_samples[backward];

 	return accumulator;
 }

 // Computes a Hann-windowed, stretched sinc low-pass filter and returns its
 // coefficients rearranged as a polyphase filter bank.
 //
 //   p_upsampling_factor:   interpolation factor; also the number of
 //                          subfilters in the bank.
 //   p_downsampling_factor: decimation factor; only used to pick the sinc
 //                          stretch factor and the gain correction.
 //   p_filter_size:         number of taps of each subfilter.
 //
 // Returns p_upsampling_factor * p_filter_size coefficients, stored subfilter
 // after subfilter. If p_upsampling_factor or p_filter_size is zero, an empty
 // vector is returned.
 samples compute_polyphase_filter_bank(std::size_t const p_upsampling_factor, std::size_t const p_downsampling_factor, std::size_t const p_filter_size)
 {
 	// Without subfilters or taps there is nothing to compute. Returning
 	// early also avoids dividing by a subfilter count of zero below.
 	if ((p_upsampling_factor == 0) || (p_filter_size == 0))
 		return samples();

 	// Create the window function that will be applied on top of the
 	// stretched sinc. The window size equals the filter size times the
 	// upsampling factor, since we need to account for the extra nullsamples
 	// that get inserted during zerostuffing.
 	samples window = compute_hanning_window(p_filter_size * p_upsampling_factor);

 	samples filter(window.size(), 0);

 	// We need to create a low pass filter that cuts off frequencies at half of
 	// either the input or the output sample rate, whichever is lower. However,
 	// it turns out that for the filter calculation, we don't really need the
 	// sample rates directly. Instead, we only need to "stretch" sinc by a
 	// specific factor. A factor of 1 means no change. A factor of 2 means that
 	// half of the original frequency range is cut off etc.
 	//
 	// If we only want to upsample by a factor of 2, and don't want to downsample,
 	// then it is sufficient to stretch the sinc by a factor of 2. This is because
 	// when we upsample, we apply zero stuffing, which creates unwanted spectral
 	// images above the desired range. So, if for example we upsample by a factor
 	// of 2, this means we insert 2-1 = 1 nullsample in between the original
 	// samples (this is the zero-stuffing part). Like this:
 	//
 	// a b c d e f ... -> a 0 b 0 c 0 d 0 e 0 f 0 ...
 	//
 	// The lower 50% of the spectrum contains the original spectral image. So, we
 	// need to get rid of anything except the lower half in the zero-stuffed signal.
 	//
 	// Another example: Upsampling by a factor of 3, no downsampling. Here,
 	// we have 2 unwanted spectral images above the original one, and 3-1 = 2
 	// nullsamples were stuffed in between the original samples. Like this:
 	//
 	// a b c d ... -> a 0 0 b 0 0 c 0 0 d 0 0 ...
 	//
 	// Since we only want the original image, and its spectrum only makes 1/3rd
 	// of the spectrum of the zerostuffed signal, we stretch sinc by a factor of
 	// 3, which causes it to lowpass-filter anything except the lowest 33.3% of
 	// the spectrum.
 	//
 	// If we also downsample, then it depends by how much. If the downsampling
 	// factor is higher than the upsampling, it means that the downsampled
 	// signal will have a Nyquist frequency that is *lower* than that of the
 	// original signal. Example: converting from 24000 Hz to 22050 Hz. Here,
 	// we need to lowpass-filter to make sure we get rid of all frequencies
 	// above 22050/2 = 11025 Hz, otherwise aliasing will occur in the downsampled
 	// signal.
 	//
 	// If instead we downsample to a rate that is *higher* than the original
 	// one, we lowpass-filter with the original signal's Nyquist frequency.
 	// If we convert from 24000 Hz to 44100 Hz, we lowpass-filter to get rid
 	// of all frequencies above 24000 / 2 = 12000 Hz.
 	//
 	// Again, the actual sample rate does not matter, only the factors do.
 	// We simply pick the higher of the two (up/downsampling factors). This
 	// corresponds to picking the lower of the two sample rates (input/output
 	// sample rates).
 	float scale_factor = float(std::max(p_downsampling_factor, p_upsampling_factor));

 	// The polyphase filter bank is stored as a one-dimensional array.
 	// The polyphase filters are arranged in the array as shown in
 	// this example:
 	//
 	//   AAABBBCCCDDD
 	//
 	// Where A,B,C,D are the coefficients of polyphase filters A,B,C,D.
 	// Upsampling factor is 4 (that's why there are four filters). Each
 	// filter has 3 taps in this example.

 	// NOTE: It would be useful to reverse the filters, that is,
 	// coefficients 1234 -> 4321, to make convolution easier, because
 	// then it can be done using a simple multiply-and-add operation.
 	// (Convolution does multiply-and-add, but with one of the inputs
 	// reversed, so by pre-reversing one, it devolves into a simple
 	// multiply-and-add.)

 	std::size_t num_filters = p_upsampling_factor;
 	std::size_t num_filter_taps = window.size() / num_filters;

 	for (std::size_t i = 0; i < window.size(); ++i)
 	{
 		// Note that we do _not_ scale t by the window size here. So, we do not
 		// normalize it in any way. This is intentional: The filter size defines
 		// the _quality_ of the filtering. Larger filter means more sinc coefficients,
 		// or in other words, we tap a larger range of sinc. If we normalized t,
 		// we would always tap the _same_ range of the sinc function (the area
 		// around the main lobe).
 		// We do however offset t to make sure it ranges from -w/2 to w/2-1 instead of
 		// 0 to w-1 . Otherwise we don't get a symmetric sinc tap.
 		// TODO: Should it be from -w to +w instead?
 		float t = int(i) - int(window.size() / 2);

 		// Compute stretched sinc.
 		float f = sinc(t / scale_factor);

 		// Coefficient i of the full filter belongs to subfilter (i mod N)
 		// and is that subfilter's (i / N)th tap.
 		std::size_t polyphase_filter_index = i % num_filters;
 		std::size_t offset_in_polyphase_filter = i / num_filters;
 		std::size_t filter_array_idx = polyphase_filter_index * num_filter_taps + offset_in_polyphase_filter;

 		// Apply the window function on top of the sinc
 		// function to produce filter coefficients.
 		filter[filter_array_idx] = f * window[i];
 	}

 	// Gain correction. The coefficients of a sinc stretched by a factor S
 	// add up to roughly S, and each of the N subfilters gets every Nth
 	// coefficient, so one subfilter has a gain of roughly S / N. When
 	// S equals the upsampling factor N this is 1, but when the downsampling
 	// factor M is the larger one (S = M), the gain is about M / N. Scaling
 	// by N / M brings it back to 1; without this the output is amplified.
 	if (p_downsampling_factor > p_upsampling_factor)
 	{
 		float dampening_factor = float(p_upsampling_factor) / p_downsampling_factor;
 		for (auto & filter_coefficient : filter)
 			filter_coefficient *= dampening_factor;
 	}

 	return filter;
 }


 //// Sound I/O functionality ////

 class sound_file
 {
 public:
 	sound_file()
 		: m_sndfile(nullptr)
 		, m_num_samples(0)
 		, m_sample_rate(0)
 	{
 	}

 	bool open_for_reading(std::string const &p_filename)
 	{
 		if (m_sndfile != nullptr)
 		{
 			std::cerr << "A file has already been opened\n";
 			return false;
 		}

 		SF_INFO info;
 		info.format = 0;

 		if (!open_internal(p_filename, true, info))
 			return false;

 		if (info.channels != 1)
 		{
 			sf_close(m_sndfile);
 			m_sndfile = nullptr;
 			std::cerr << "File \"" << p_filename << "\" has " << info.channels << " channels; only mono is supported" << std::endl;
 			return false;
 		}

 		// Technically, this is wrong, since 1 frame is a collection of N
 		// samples, with N being the number of channels. But, since we
 		// anyway only support mono in this example, it does not matter.
 		m_num_samples = info.frames;
 		m_sample_rate = info.samplerate;

 		return true;
 	}

 	bool open_for_writing(std::string const &p_filename, unsigned int const p_sample_rate)
 	{
 		if (m_sndfile != nullptr)
 		{
 			std::cerr << "A file has already been opened\n";
 			return false;
 		}

 		SF_INFO info;
 		info.format = SF_FORMAT_WAV | SF_FORMAT_FLOAT;
 		info.samplerate = p_sample_rate;
 		info.channels = 1;

 		if (!open_internal(p_filename, false, info))
 			return false;

 		m_num_samples = 0;
 		m_sample_rate = p_sample_rate;

 		return true;
 	}

 	~sound_file()
 	{
 		if (m_sndfile != nullptr)
 			sf_close(m_sndfile);
 	}

 	std::size_t get_total_num_input_samples() const
 	{
 		return m_num_samples;
 	}

 	unsigned int get_sample_rate() const
 	{
 		return m_sample_rate;
 	}

 	std::size_t read_all_samples(samples &p_samples)
 	{
 		return read_samples(p_samples, m_num_samples);
 	}

 	std::size_t read_samples(samples &p_samples, std::size_t const p_num_samples_to_read)
 	{
 		if (p_num_samples_to_read > p_samples.size())
 			p_samples.resize(p_num_samples_to_read);

 		return sf_read_float(m_sndfile, &p_samples[0], p_num_samples_to_read);
 	}

 	void write_samples(samples const &p_samples, std::size_t p_num_samples_to_write)
 	{
 		if (p_num_samples_to_write > p_samples.size())
 			p_num_samples_to_write = p_samples.size();

 		sf_write_float(m_sndfile, &p_samples[0], p_num_samples_to_write);
 	}


 private:
 	bool open_internal(std::string const &p_filename, bool const p_read, SF_INFO &p_info)
 	{
 		m_sndfile = sf_open(p_filename.c_str(), p_read ? SFM_READ : SFM_WRITE, &p_info);

 		if (m_sndfile == nullptr)
 		{
 			std::cerr << "Could not open file \"" << p_filename << "\" for " << (p_read ? "reading" : "writing") << std::endl;
 		}

 		return (m_sndfile != nullptr);
 	}

 	SNDFILE *m_sndfile;
 	std::size_t m_num_samples;
 	unsigned int m_sample_rate;
 };


 //// main ////

 int main(int argc, char *argv[])
 {
 	// Get the command line arguments.
 	if (argc < 5)
 	{
 		std::cerr << "Usage: " << argv[0] << " <input filename> <output WAV filename> <output sample rate> <filter size>\n";
 		return -1;
 	}

 	std::string input_filename = argv[1];
 	std::string output_filename = argv[2];
 	unsigned int output_sample_rate = std::stoul(argv[3]);
 	std::size_t filter_size = std::stoul(argv[4]);


 #ifdef WITH_RLIMIT
 	// Limit the amount of memory this process can allocate to 1 GB to
 	// prevent lots of swap activity in case we allocate too much. Since
 	// this example just loads the entirety of the input file to memory,
 	// this can happen with long tracks. With this rlimit, the process
 	// is killed as soon as the limit is hit.
 	struct rlimit memlim;
 	getrlimit(RLIMIT_AS, &memlim);
 	memlim.rlim_cur = std::min(std::size_t(1024*1024*1024), memlim.rlim_cur);
 	setrlimit(RLIMIT_AS, &memlim);
 #endif


 	// Open the input file.
 	// NOTE: In this example, we read and sample rate convert the entire
 	// input file at once. This is for sake of clarity and simplicity.
 	// In production, the conversion would not be done that way. Instead,
 	// the input samples would be streamed in and converted on the fly.
 	// This saves a ton of memory, and would also work with live streams.

 	sound_file input_sound_file;
 	if (!input_sound_file.open_for_reading(input_filename))
 		return -1;
 	samples input_samples;
 	input_sound_file.read_all_samples(input_samples);
 	if (input_samples.empty())
 	{
 		std::cerr << "Did not get any input samples\n";
 		return -1;
 	}

 	unsigned int input_sample_rate = input_sound_file.get_sample_rate();


 	// Sample rate conversion works by first interpolating the signal
 	// to a higher sample rate, then decimating to a lower sample rate.
 	//
 	// Interpolation means that new samples are introduced in between
 	// the original input samples, followed by a lowpass filter that is
 	// applied on that augmented input signal. The signal is augmented
 	// by inserting zeros in between the original samples. This is
 	// called "zero-stuffing". Zero-stuffing introduces copies of the
 	// original spectrum, _above_ the original spectrum. This is where
 	// the low-pass filter comes in - it cuts off these unwanted copies,
 	// leaving us with an upsampled version of the original signal.
 	//
 	// So, if we want to upsample the signal by an integer factor N,
 	// we insert N-1 nullsamples between the original samples.
 	//
 	// Decimation can then be applied. This simply involves picking
 	// every Mth sample, M being the decimation factor. M=1 means no
 	// decimation.
 	//
 	// In short, first we upsample by N by applying interpolation,
 	// then we downsample by M by applying decimation. The M/N ratio
 	// is the overall sample rate conversion ratio. Upsampling factor
 	// N is also the interpolation factor N. Downsampling factor M
 	// is also the decimation factor M.
 	//
 	// The up/downsampling factors are derived from the input and
 	// output sample rates. To that end, the least common denominator
 	// of the sample rates is determined. That's because the up- and
 	// downsampling factors influence filter sizes, and we want filters
 	// to not be unnecessarily large. For example, input sample rate
 	// 48000 Hz, output sample rate 44100 Hz, that's a ratio of
 	// 48000/44100. We could use N=48000, M=44100, but for better
 	// efficiency (as explained above), we reduce this and come up
 	// with an equal ratio of 160/147.
 	//
 	// Once the factors are known, we could in theory get the input
 	// signal's samples and stuff in nullsamples between these samples.
 	// However, that would be wasteful (in the example above it would
 	// increase the input signal size by a factor of 160). We can
 	// make the observation that during convolution, these nullsamples
 	// don't contribute to the output, because these nullsamples are
 	// multiplied with filter coefficients and added. So, an equation
 	// like:
 	//
 	//   output = sample * coeff1 + 0 * coeff2 + 0 * coeff3 ...
 	//
 	// will only really be influenced by the original samples, not
 	// by the added nullsamples.
 	//
 	// The idea then is to simply omit these nullsamples and only
 	// pick the coefficients that would actually be applied (in the
 	// example above, coeff1 would be applied, while coeff2 and
 	// coeff3 would not). We observe that in a zerostuffed signal,
 	// the original samples appear in every Nth position. So, in this
 	// zerostuffed signal:
 	//
 	// a 0 0 b 0 0 c ..
 	//
 	// every 3rd sample is an original sample. If we have a filter
 	// with 12 taps, then its coefficients can be decomposed into
 	// subfilters. The number of filters equal the upsampling factor.
 	// In this example, the decomposition would be:
 	//
 	// Original filter: h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12
 	// Subfilter 1:     h1       h4       h7       h10
 	// Subfilter 2:        h2       h5       h8        h11
 	// Subfilter 3:           h3       h6       h9         h12
 	//
 	// During decimation, we would normally pick a sample from the
 	// interpolated signal. Suppose that we didn't use polyphase
 	// filters, but instead actually did use zerostuffing and
 	// filtered that zerostuffed signal. Suppose upsampling factor
 	// N is 3, downsampling factor M is 2.
 	//
 	// Original signal: i1 i2 i3 i4 i5 i6 i7 ..
 	// Zerostuffed signal: i1 0 0 i2 0 0 i3 0 0 i4 0 0 i5 0 0 ..
 	// Interpolated signal: i1 a b i2 c d i3 e f i4 g h i6 i j ..
 	//   (a-j are newly interpolated samples that resulted from
 	//   convolving the filter with the zerostuffed signal)
 	//
 	// Decimation factor M = 2 means we pick every 2nd
 	// interpolated sample, and get the output signal
 	// i1 b c i3 f g i6 ..
 	//
 	// We optimize this by using polyphase filters, eliminating
 	// the need for an intermediate zerostuffed signal. Instead,
 	// we pick the subfilter that would produce the sample that
 	// we pick during decimation.
 	//
 	// From the example above, suppose we are about to pick
 	// sample "b". to compute "b", we pick the appropriate
 	// subfilter. We find this subfilter by computing the
 	// position in the interpolated signal. This we call the
 	// "interpolated position". In the case of "b", this position
 	// would be 2. (Positions start at 0.) We then apply modulo 3
 	// (the number of subfilters) to pick the appropriate subfilter.
 	// That's subfilter (2 mod 3) = 2 in this case.
 	//
 	// Then we need the position in the original input signal that
 	// corresponds to the "b" sample". This we get by calculating
 	// position_in_original_signal = position_in_output_signal * M / N
 	// So, in this case, "b" is at position #1 in the output signal,
 	// and 1*2/3 = 0. Now we have the position in the input signal
 	// where we get the input samples that we want to convolve with
 	// the subfilter #2 we picked earlier. This means that this
 	// subfilter is convolved with input samples i1 to i4.
 	//
 	// Another example would be if we wanted to compute sample "g",
 	// which is at "interpolated position" position 10 and at output
 	// position 5. 10 mod 3 = 1, and 5*2/3 = 3. So, we would convolve
 	// input samples i4 to i7 with subfilter #1.


 	// Compute the up- and downsampling factors.
 	unsigned int sample_rate_lcm = lcm(input_sample_rate, output_sample_rate);
 	// This is the interpolation factor N mentioned above.
 	unsigned int upsampling_factor = sample_rate_lcm / input_sample_rate;
 	// This is the decimation factor M mentioned above.
 	unsigned int downsampling_factor = sample_rate_lcm / output_sample_rate;


 	// Print some info.
 	std::cerr << "Input / output sample rate: " << input_sample_rate << " Hz / " << output_sample_rate << " Hz\n";
 	std::cerr << "Up / downsampling factors: " << upsampling_factor << " / " << downsampling_factor << "\n";
 	std::cerr << "Filter size: " << filter_size << "\n";


 	// Prepare the polyphase filter and the output samples buffer.
 	samples polyphase_filter_bank = compute_polyphase_filter_bank(upsampling_factor, downsampling_factor, filter_size);
 	samples output_samples(input_samples.size() * upsampling_factor / downsampling_factor, 0);

 	std::size_t num_polyphase_filters = upsampling_factor;
 	std::size_t polyphase_filter_size = polyphase_filter_bank.size() / num_polyphase_filters;


 	// Open the output file.
 	sound_file output_sound_file;
 	if (!output_sound_file.open_for_writing(output_filename, output_sample_rate))
 		return -1;


 	// Now perform the sample rate conversion.
 	std::size_t output_position = 0;
 	std::size_t interpolated_position = 0;
 	std::size_t last_progress = 0;
 	// Don't iterate over the last filter_size samples. That's because
 	// convolve() will access all samples from output_position to
 	// (output_position + polyphase_filter_size - 1).
 	for (; output_position < output_samples.size() - filter_size; ++output_position, interpolated_position += downsampling_factor)
 	{
 		// Print some progress. We keep track of the last computed
 		// progress to check if we should actually print something.
 		// Otherwise, the constant console output could actually
 		// slow down this code (and it floods the console with
 		// lines of course).
 		std::size_t progress = output_position / 100000;
 		if (progress != last_progress)
 		{
 			std::cerr << output_position << "/" << output_samples.size() << "(" << (float(output_position) / float(output_samples.size()) * 100.0f) << "%)\n";
 			last_progress = progress;
 		}

 		// Pick the right subfilter.
 		std::size_t polyphase_filter_index = interpolated_position % num_polyphase_filters;
 		sample_type const *polyphase_filter_coefficients = &(polyphase_filter_bank[polyphase_filter_index * polyphase_filter_size]);

 		// Pick what input samples we need to convolve with the
 		// chosen subfilter.
 		sample_type const *input_samples_coefficients = &(input_samples[output_position * downsampling_factor / upsampling_factor]);

 		// Perform the convolution, producing the sample rate
 		// converted output.
 		output_samples[output_position] = convolve(input_samples_coefficients, polyphase_filter_coefficients, polyphase_filter_size);
 	}


 	// Write the result.
 	output_sound_file.write_samples(output_samples, output_samples.size());


 	return 0;
 }
	#include <vector>
	#include <string>
	#include <cmath>
	#include <iostream>
	#include <sndfile.h>

	#ifdef WITH_RLIMIT
	#include <sys/resource.h>
	#endif


	//// Common types ////

	typedef float sample_type;
	typedef std::vector < sample_type > samples;


	//// Math functions ////

	static float const pi = 3.1415926535f;

	// Greatest common divisor of two unsigned integers, computed with the
	// iterative Euclidean algorithm. If one argument is zero, the other one
	// is returned unchanged (so gcd(0, 0) is 0).
	unsigned int gcd(unsigned int p_first, unsigned int p_second)
	{
		if (p_first == 0)
			return p_second;

		// Replace (a, b) by (b, a mod b) until the remainder vanishes;
		// the last non-zero value is the divisor we are looking for.
		while (p_second != 0)
		{
			unsigned int const remainder = p_first % p_second;
			p_first = p_second;
			p_second = remainder;
		}

		return p_first;
	}

	// Least common multiple of two unsigned integers. Returns 0 if both
	// arguments are zero (and also if just one of them is zero).
	//
	// The result itself must fit into an unsigned int; no overflow check is
	// done for that.
	unsigned int lcm(unsigned int p_first, unsigned int p_second)
	{
		unsigned int const divisor = gcd(p_first, p_second);
		if (divisor == 0)
			return 0;

		// Divide before multiplying. p_first is an exact multiple of the
		// divisor, so the result is the same, but the intermediate value never
		// exceeds the final result. Multiplying first overflows for common
		// sample rate pairs such as 96000 and 88200.
		return (p_first / divisor) * p_second;
	}

	// Normalized sinc function: sin(pi * f) / (pi * f). The value at f == 0,
	// where the quotient would be 0/0, is defined as 1.
	float sinc(float f)
	{
		if (f == 0.0f)
			return 1.0f;

		return std::sin(f * pi) / (f * pi);
	}

	// Value of an N-point Hann ("hanning") window at index n:
	//
	//   0.5 * (1 - cos(2 * pi * n / (N - 1)))
	//
	// For N == 1 the formula divides zero by zero and would yield NaN, so the
	// single-point window is defined as 1 instead.
	float hanning_func(int n, int N)
	{
		if (N == 1)
			return 1.0f;

		return 0.5f * (1.0f - std::cos(2.0f * pi * float(n) / float(N - 1)));
	}

	// Returns a vector with p_window_size entries, where entry n holds
	// hanning_func(n, p_window_size).
	samples compute_hanning_window(std::size_t const p_window_size)
	{
		samples window;
		window.reserve(p_window_size);

		for (std::size_t n = 0; n != p_window_size; ++n)
			window.push_back(hanning_func(static_cast<int>(n), static_cast<int>(p_window_size)));

		return window;
	}

	// Computes one output value of a convolution: the sum of
	// p_first_samples[i] * p_second_samples[p_num_samples - 1 - i] for all i
	// in [0, p_num_samples). Both arrays must hold at least p_num_samples
	// values. Returns 0 if p_num_samples is 0.
	//
	// Both sample arguments are pointers to the first element of an array;
	// they are indexed below, so they cannot be plain sample values.
	sample_type convolve(sample_type const *p_first_samples, sample_type const *p_second_samples, std::size_t const p_num_samples)
	{
		sample_type result = 0.0f;
		int num_samples = int(p_num_samples);

		for (int first_index = 0; first_index < num_samples; ++first_index)
		{
			// The second array is traversed back to front.
			int second_index = num_samples - 1 - first_index;

			sample_type first_sample = p_first_samples[first_index];
			sample_type second_sample = p_second_samples[second_index];

			result += first_sample * second_sample;
		}

		return result;
	}

	// Computes a Hann-windowed, stretched sinc low-pass filter and returns its
	// coefficients rearranged as a polyphase filter bank.
	//
	//   p_upsampling_factor:   interpolation factor; also the number of
	//                          subfilters in the bank.
	//   p_downsampling_factor: decimation factor; only used to pick the sinc
	//                          stretch factor and the gain correction.
	//   p_filter_size:         number of taps of each subfilter.
	//
	// Returns p_upsampling_factor * p_filter_size coefficients, stored subfilter
	// after subfilter. If p_upsampling_factor or p_filter_size is zero, an empty
	// vector is returned.
	samples compute_polyphase_filter_bank(std::size_t const p_upsampling_factor, std::size_t const p_downsampling_factor, std::size_t const p_filter_size)
	{
		// Without subfilters or taps there is nothing to compute. Returning
		// early also avoids dividing by a subfilter count of zero below.
		if ((p_upsampling_factor == 0) || (p_filter_size == 0))
			return samples();

		// Create the window function that will be applied on top of the
		// stretched sinc. The window size equals the filter size times the
		// upsampling factor, since we need to account for the extra nullsamples
		// that get inserted during zerostuffing.
		samples window = compute_hanning_window(p_filter_size * p_upsampling_factor);

		samples filter(window.size(), 0);

		// We need to create a low pass filter that cuts off frequencies at half of
		// either the input or the output sample rate, whichever is lower. However,
		// it turns out that for the filter calculation, we don't really need the
		// sample rates directly. Instead, we only need to "stretch" sinc by a
		// specific factor. A factor of 1 means no change. A factor of 2 means that
		// half of the original frequency range is cut off etc.
		//
		// If we only want to upsample by a factor of 2, and don't want to downsample,
		// then it is sufficient to stretch the sinc by a factor of 2. This is because
		// when we upsample, we apply zero stuffing, which creates unwanted spectral
		// images above the desired range. So, if for example we upsample by a factor
		// of 2, this means we insert 2-1 = 1 nullsample in between the original
		// samples (this is the zero-stuffing part). Like this:
		//
		// a b c d e f ... -> a 0 b 0 c 0 d 0 e 0 f 0 ...
		//
		// The lower 50% of the spectrum contains the original spectral image. So, we
		// need to get rid of anything except the lower half in the zero-stuffed signal.
		//
		// Another example: Upsampling by a factor of 3, no downsampling. Here,
		// we have 2 unwanted spectral images above the original one, and 3-1 = 2
		// nullsamples were stuffed in between the original samples. Like this:
		//
		// a b c d ... -> a 0 0 b 0 0 c 0 0 d 0 0 ...
		//
		// Since we only want the original image, and its spectrum only makes 1/3rd
		// of the spectrum of the zerostuffed signal, we stretch sinc by a factor of
		// 3, which causes it to lowpass-filter anything except the lowest 33.3% of
		// the spectrum.
		//
		// If we also downsample, then it depends by how much. If the downsampling
		// factor is higher than the upsampling, it means that the downsampled
		// signal will have a Nyquist frequency that is lower than that of the
		// original signal. Example: converting from 24000 Hz to 22050 Hz. Here,
		// we need to lowpass-filter to make sure we get rid of all frequencies
		// above 22050/2 = 11025 Hz, otherwise aliasing will occur in the downsampled
		// signal.
		//
		// If instead we downsample to a rate that is higher than the original
		// one, we lowpass-filter with the original signal's Nyquist frequency.
		// If we convert from 24000 Hz to 44100 Hz, we lowpass-filter to get rid
		// of all frequencies above 24000 / 2 = 12000 Hz.
		//
		// Again, the actual sample rate does not matter, only the factors do.
		// We simply pick the higher of the two (up/downsampling factors). This
		// corresponds to picking the lower of the two sample rates (input/output
		// sample rates).
		float scale_factor = float(std::max(p_downsampling_factor, p_upsampling_factor));

		// The polyphase filter bank is stored as a one-dimensional array.
		// The polyphase filters are arranged in the array as shown in
		// this example:
		//
		//   AAABBBCCCDDD
		//
		// Where A,B,C,D are the coefficients of polyphase filters A,B,C,D.
		// Upsampling factor is 4 (that's why there are four filters). Each
		// filter has 3 taps in this example.

		// NOTE: It would be useful to reverse the filters, that is,
		// coefficients 1234 -> 4321, to make convolution easier, because
		// then it can be done using a simple multiply-and-add operation.
		// (Convolution does multiply-and-add, but with one of the inputs
		// reversed, so by pre-reversing one, it devolves into a simple
		// multiply-and-add.)

		std::size_t num_filters = p_upsampling_factor;
		std::size_t num_filter_taps = window.size() / num_filters;

		for (std::size_t i = 0; i < window.size(); ++i)
		{
			// Note that we do _not_ scale t by the window size here. So, we do not
			// normalize it in any way. This is intentional: The filter size defines
			// the _quality_ of the filtering. Larger filter means more sinc coefficients,
			// or in other words, we tap a larger range of sinc. If we normalized t,
			// we would always tap the _same_ range of the sinc function (the area
			// around the main lobe).
			// We do however offset t to make sure it ranges from -w/2 to w/2-1 instead of
			// 0 to w-1 . Otherwise we don't get a symmetric sinc tap.
			// TODO: Should it be from -w to +w instead?
			float t = int(i) - int(window.size() / 2);

			// Compute stretched sinc.
			float f = sinc(t / scale_factor);

			// Coefficient i of the full filter belongs to subfilter (i mod N)
			// and is that subfilter's (i / N)th tap.
			std::size_t polyphase_filter_index = i % num_filters;
			std::size_t offset_in_polyphase_filter = i / num_filters;
			std::size_t filter_array_idx = polyphase_filter_index * num_filter_taps + offset_in_polyphase_filter;

			// Apply the window function on top of the sinc
			// function to produce filter coefficients.
			filter[filter_array_idx] = f * window[i];
		}

		// Gain correction. The coefficients of a sinc stretched by a factor S
		// add up to roughly S, and each of the N subfilters gets every Nth
		// coefficient, so one subfilter has a gain of roughly S / N. When
		// S equals the upsampling factor N this is 1, but when the downsampling
		// factor M is the larger one (S = M), the gain is about M / N. Scaling
		// by N / M brings it back to 1; without this the output is amplified.
		if (p_downsampling_factor > p_upsampling_factor)
		{
			float dampening_factor = float(p_upsampling_factor) / p_downsampling_factor;
			for (auto & filter_coefficient : filter)
				filter_coefficient *= dampening_factor;
		}

		return filter;
	}


	//// Sound I/O functionality ////

	class sound_file
	{
	public:
	sound_file()
	: m_sndfile(nullptr)
	, m_num_samples(0)
	, m_sample_rate(0)
	{
	}

	bool open_for_reading(std::string const &p_filename)
	{
	if (m_sndfile != nullptr)
	{
	std::cerr << "A file has already been opened\n";
	return false;
	}

	SF_INFO info;
	info.format = 0;

	if (!open_internal(p_filename, true, info))
	return false;

	if (info.channels != 1)
	{
	sf_close(m_sndfile);
	m_sndfile = nullptr;
	std::cerr << "File \"" << p_filename << "\" has " << info.channels << " channels; only mono is supported" << std::endl;
	return false;
	}

	// Technically, this is wrong, since 1 frame is a collection of N
	// samples, with N being the number of channels. But, since we
	// anyway only support mono in this example, it does not matter.
	m_num_samples = info.frames;
	m_sample_rate = info.samplerate;

	return true;
	}

	bool open_for_writing(std::string const &p_filename, unsigned int const p_sample_rate)
	{
	if (m_sndfile != nullptr)
	{
	std::cerr << "A file has already been opened\n";
	return false;
	}

	SF_INFO info;
	info.format = SF_FORMAT_WAV \| SF_FORMAT_FLOAT;
	info.samplerate = p_sample_rate;
	info.channels = 1;

	if (!open_internal(p_filename, false, info))
	return false;

	m_num_samples = 0;
	m_sample_rate = p_sample_rate;

	return true;
	}

	~sound_file()
	{
	if (m_sndfile != nullptr)
	sf_close(m_sndfile);
	}

	std::size_t get_total_num_input_samples() const
	{
	return m_num_samples;
	}

	unsigned int get_sample_rate() const
	{
	return m_sample_rate;
	}

	std::size_t read_all_samples(samples &p_samples)
	{
	return read_samples(p_samples, m_num_samples);
	}

	std::size_t read_samples(samples &p_samples, std::size_t const p_num_samples_to_read)
	{
	if (p_num_samples_to_read > p_samples.size())
	p_samples.resize(p_num_samples_to_read);

	return sf_read_float(m_sndfile, &p_samples[0], p_num_samples_to_read);
	}

	void write_samples(samples const &p_samples, std::size_t p_num_samples_to_write)
	{
	if (p_num_samples_to_write > p_samples.size())
	p_num_samples_to_write = p_samples.size();

	sf_write_float(m_sndfile, &p_samples[0], p_num_samples_to_write);
	}


	private:
	bool open_internal(std::string const &p_filename, bool const p_read, SF_INFO &p_info)
	{
	m_sndfile = sf_open(p_filename.c_str(), p_read ? SFM_READ : SFM_WRITE, &p_info);

	if (m_sndfile == nullptr)
	{
	std::cerr << "Could not open file \"" << p_filename << "\" for " << (p_read ? "reading" : "writing") << std::endl;
	}

	return (m_sndfile != nullptr);
	}

	SNDFILE *m_sndfile;
	std::size_t m_num_samples;
	unsigned int m_sample_rate;
	};


	//// main ////

	// Program entry point: reads the entire input file, converts it to the
	// requested output sample rate with a polyphase filter bank, and writes
	// the result to the output file.
	//
	// Expected arguments:
	//   argv[1]  input filename
	//   argv[2]  output WAV filename
	//   argv[3]  output sample rate (positive integer)
	//   argv[4]  filter size (passed to compute_polyphase_filter_bank())
	//
	// Returns 0 on success and -1 if the arguments are invalid, a file
	// could not be opened, or no input samples could be read.
	int main(int argc, char *argv[])
	{
		// Get the command line arguments.
		if (argc < 5)
		{
			std::cerr << "Usage: " << argv[0] << " <input filename> <output WAV filename> <output sample rate> <filter size>\n";
			return -1;
		}

		std::string input_filename = argv[1];
		std::string output_filename = argv[2];
		unsigned int output_sample_rate = 0;
		std::size_t filter_size = 0;

		// std::stoul() throws if an argument is not a number or does not
		// fit into an unsigned long. Both cases are reported the same way
		// instead of letting the exception terminate the process.
		try
		{
			unsigned long const parsed_sample_rate = std::stoul(argv[3]);
			output_sample_rate = static_cast < unsigned int > (parsed_sample_rate);

			// Reject values that got truncated by the conversion to unsigned int.
			if (static_cast < unsigned long > (output_sample_rate) != parsed_sample_rate)
			{
				std::cerr << "Output sample rate " << parsed_sample_rate << " is out of range\n";
				return -1;
			}

			filter_size = std::stoul(argv[4]);
		}
		catch (...)
		{
			std::cerr << "Output sample rate and filter size must be non-negative integers\n";
			return -1;
		}

		// A zero sample rate would lead to a division by zero when the
		// up/downsampling factors are computed further below.
		if (output_sample_rate == 0)
		{
			std::cerr << "Output sample rate must be greater than zero\n";
			return -1;
		}


	#ifdef WITH_RLIMIT
		// Limit the amount of memory this process can allocate to 1 GB to
		// prevent lots of swap activity in case we allocate too much. Since
		// this example just loads the entirety of the input file to memory,
		// this can happen with long tracks. With this rlimit, the process
		// fails as soon as the limit is hit instead of swapping.
		struct rlimit memlim;
		if (getrlimit(RLIMIT_AS, &memlim) == 0)
		{
			rlim_t const max_address_space = rlim_t(1024) * 1024 * 1024;
			// Only ever lower the soft limit; never raise an existing stricter one.
			if (memlim.rlim_cur > max_address_space)
				memlim.rlim_cur = max_address_space;
			setrlimit(RLIMIT_AS, &memlim);
		}
	#endif


		// Open the input file.
		// NOTE: In this example, we read and sample rate convert the entire
		// input file at once. This is for sake of clarity and simplicity.
		// In production, the conversion would not be done that way. Instead,
		// the input samples would be streamed in and converted on the fly.
		// This saves a ton of memory, and would also work with live streams.

		sound_file input_sound_file;
		if (!input_sound_file.open_for_reading(input_filename))
			return -1;

		samples input_samples;
		input_sound_file.read_all_samples(input_samples);
		if (input_samples.empty())
		{
			std::cerr << "Did not get any input samples\n";
			return -1;
		}

		unsigned int input_sample_rate = input_sound_file.get_sample_rate();
		// Same reasoning as for the output sample rate: avoid a division by zero.
		if (input_sample_rate == 0)
		{
			std::cerr << "Input file reports a sample rate of zero\n";
			return -1;
		}


		// Sample rate conversion works by first interpolating the signal
		// to a higher sample rate, then decimating to a lower sample rate.
		//
		// Interpolation means that new samples are introduced in between
		// the original input samples, followed by a lowpass filter that is
		// applied on that augmented input signal. The signal is augmented
		// by inserting zeros in between the original samples. This is
		// called "zero-stuffing". Zero-stuffing introduces copies of the
		// original spectrum, _above_ the original spectrum. This is where
		// the low-pass filter comes in - it cuts off these unwanted copies,
		// leaving us with an upsampled version of the original signal.
		//
		// So, if we want to upsample the signal by an integer factor N,
		// we insert N-1 nullsamples between the original samples.
		//
		// Decimation can then be applied. This simply involves picking
		// every Mth sample, M being the decimation factor. M=1 means no
		// decimation.
		//
		// In short, first we upsample by N by applying interpolation,
		// then we downsample by M by applying decimation. The N/M ratio
		// is the overall sample rate conversion ratio, that is,
		// output rate = input rate * N / M. Upsampling factor N is also
		// the interpolation factor N. Downsampling factor M is also the
		// decimation factor M.
		//
		// The up/downsampling factors are derived from the input and
		// output sample rates. To that end, the least common multiple
		// (LCM) of the sample rates is determined. That's because the
		// up- and downsampling factors influence filter sizes, and we
		// want filters to not be unnecessarily large. For example, with
		// input sample rate 48000 Hz and output sample rate 44100 Hz,
		// the output/input ratio is 44100/48000. We could use N=44100,
		// M=48000, but for better efficiency, we reduce this and come up
		// with the equal ratio 147/160 (N=147, M=160).
		//
		// Once the factors are known, we could in theory get the input
		// signal's samples and stuff in nullsamples between these samples.
		// However, that would be wasteful (in the example above it would
		// increase the input signal size by a factor of 147). We can
		// make the observation that during convolution, these nullsamples
		// don't contribute to the output, because these nullsamples are
		// multiplied with filter coefficients and added. So, an equation
		// like:
		//
		//   output = sample * coeff1 + 0 * coeff2 + 0 * coeff3 ...
		//
		// will only really be influenced by the original samples, not
		// by the added nullsamples.
		//
		// The idea then is to simply omit these nullsamples and only
		// pick the coefficients that would actually be applied (in the
		// example above, coeff1 would be applied, while coeff2 and
		// coeff3 would not). We observe that in a zerostuffed signal,
		// the original samples appear in every Nth position. So, in this
		// zerostuffed signal:
		//
		//   a 0 0 b 0 0 c ..
		//
		// every 3rd sample is an original sample. If we have a filter
		// with 12 taps, then its coefficients can be decomposed into
		// subfilters. The number of subfilters equals the upsampling
		// factor. In this example, the decomposition would be:
		//
		//   Original filter: h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12
		//   Subfilter 1:     h1 h4 h7 h10
		//   Subfilter 2:     h2 h5 h8 h11
		//   Subfilter 3:     h3 h6 h9 h12
		//
		// During decimation, we would normally pick a sample from the
		// interpolated signal. Suppose that we didn't use polyphase
		// filters, but instead actually did use zerostuffing and
		// filtered that zerostuffed signal. Suppose upsampling factor
		// N is 3, downsampling factor M is 2.
		//
		//   Original signal:     i1 i2 i3 i4 i5 i6 i7 ..
		//   Zerostuffed signal:  i1 0 0 i2 0 0 i3 0 0 i4 0 0 i5 0 0 ..
		//   Interpolated signal: i1 a b i2 c d i3 e f i4 g h i5 i j ..
		//   (a-j are newly interpolated samples that resulted from
		//   convolving the filter with the zerostuffed signal)
		//
		//   Decimation factor M = 2 means we pick every 2nd
		//   interpolated sample, and get the output signal
		//   i1 b c i3 f g i5 ..
		//
		// We optimize this by using polyphase filters, eliminating
		// the need for an intermediate zerostuffed signal. Instead,
		// we pick the subfilter that would produce the sample that
		// we pick during decimation.
		//
		// From the example above, suppose we are about to pick
		// sample "b". To compute "b", we pick the appropriate
		// subfilter. We find this subfilter by computing the
		// position in the interpolated signal. This we call the
		// "interpolated position". In the case of "b", this position
		// would be 2. (Positions start at 0.) We then apply modulo 3
		// (the number of subfilters) to pick the appropriate subfilter.
		// That's subfilter (2 mod 3) = 2 in this case.
		//
		// Then we need the position in the original input signal that
		// corresponds to the "b" sample. This we get by calculating
		//   position_in_original_signal = position_in_output_signal * M / N
		// So, in this case, "b" is at position #1 in the output signal,
		// and 1*2/3 = 0. Now we have the position in the input signal
		// where we get the input samples that we want to convolve with
		// the subfilter #2 we picked earlier. This means that this
		// subfilter is convolved with input samples i1 to i4.
		//
		// Another example would be if we wanted to compute sample "g",
		// which is at "interpolated position" 10 and at output
		// position 5. 10 mod 3 = 1, and 5*2/3 = 3. So, we would convolve
		// input samples i4 to i7 with subfilter #1.


		// Compute the up- and downsampling factors.
		// NOTE: lcm() multiplies both rates using unsigned int arithmetic,
		// so the product of the two sample rates has to fit into that type.
		unsigned int sample_rate_lcm = lcm(input_sample_rate, output_sample_rate);
		// This is the interpolation factor N mentioned above.
		unsigned int upsampling_factor = sample_rate_lcm / input_sample_rate;
		// This is the decimation factor M mentioned above.
		unsigned int downsampling_factor = sample_rate_lcm / output_sample_rate;


		// Print some info.
		std::cerr << "Input / output sample rate: " << input_sample_rate << " Hz / " << output_sample_rate << " Hz\n";
		std::cerr << "Up / downsampling factors: " << upsampling_factor << " / " << downsampling_factor << "\n";
		std::cerr << "Filter size: " << filter_size << "\n";


		// Prepare the polyphase filter and the output samples buffer.
		// The output buffer is zero-initialized, so any output position
		// that is not computed below stays silent.
		samples polyphase_filter_bank = compute_polyphase_filter_bank(upsampling_factor, downsampling_factor, filter_size);
		samples output_samples(input_samples.size() * upsampling_factor / downsampling_factor, 0);

		std::size_t num_polyphase_filters = upsampling_factor;
		std::size_t polyphase_filter_size = polyphase_filter_bank.size() / num_polyphase_filters;


		// Open the output file.
		sound_file output_sound_file;
		if (!output_sound_file.open_for_writing(output_filename, output_sample_rate))
			return -1;


		// Now perform the sample rate conversion.
		std::size_t output_position = 0;
		std::size_t interpolated_position = 0;
		std::size_t last_progress = 0;
		// Don't iterate over the last filter_size output samples, to leave
		// room for the samples convolve() reads past the current position.
		// The subtraction is guarded, because with unsigned arithmetic it
		// would wrap around if the filter is larger than the output.
		std::size_t const num_output_positions = (output_samples.size() > filter_size) ? (output_samples.size() - filter_size) : 0;
		for (; output_position < num_output_positions; ++output_position, interpolated_position += downsampling_factor)
		{
			// Print some progress. We keep track of the last computed
			// progress to check if we should actually print something.
			// Otherwise, the constant console output could actually
			// slow down this code (and it floods the console with
			// lines of course).
			std::size_t progress = output_position / 100000;
			if (progress != last_progress)
			{
				std::cerr << output_position << "/" << output_samples.size() << "(" << (float(output_position) / float(output_samples.size()) * 100.0f) << "%)\n";
				last_progress = progress;
			}

			// Pick what input samples we need to convolve with the
			// chosen subfilter. convolve() reads the input samples from
			// input_position to (input_position + polyphase_filter_size - 1),
			// so stop once that range would exceed the input signal.
			// Input positions never decrease, so no later output position
			// could be computed either.
			std::size_t input_position = output_position * downsampling_factor / upsampling_factor;
			if ((input_position + polyphase_filter_size) > input_samples.size())
				break;
			sample_type const *input_samples_coefficients = &(input_samples[input_position]);

			// Pick the right subfilter.
			std::size_t polyphase_filter_index = interpolated_position % num_polyphase_filters;
			sample_type const *polyphase_filter_coefficients = &(polyphase_filter_bank[polyphase_filter_index * polyphase_filter_size]);

			// Perform the convolution, producing the sample rate
			// converted output.
			output_samples[output_position] = convolve(input_samples_coefficients, polyphase_filter_coefficients, polyphase_filter_size);
		}


		// Write the result.
		output_sound_file.write_samples(output_samples, output_samples.size());


		return 0;
	}