""" Use linear regression to recover or 'fill out' a completely deleted portion of an audio file! This will be using The FSDD, Free-Spoken-Digits-Dataset, an audio dataset put together by Zohar Jackson: cleaned up audio (no dead-space, roughly same length, same bitrate, same samples-per-second rate, etc) samples ready for machine learning. """ import numpy as np import pandas as pd import os from sklearn import linear_model import scipy.io.wavfile as wavfile # # INFO: # Samples = Observations. Each audio file is a single sample # in the dataset. # # Audio Samples = https://en.wikipedia.org/wiki/Sampling_(signal_processing) # Each .wav file is actually just a bunch of numeric samples, "sampled" # from the analog signal. Sampling is a type of discretization. When we # mention 'samples', we mean observations. When we mention 'audio samples', # we mean the actually "features" of the audio file. # # # The goal of this gist is to use multi-target, linear regression to generate # by extrapolation, the missing portion of the test audio file. # # Each one audio_sample features will be the output of an equation, # which is a function of the provided portion of the audio_samples: # # missing_samples = f(provided_samples) # # You can experiment with how much of the audio you want to chop off # and have the computer generate using the Provided_Portion parameter. # # Play with this. This is how much of the audio file will # be provided, in percent. The remaining percent of the file will # be generated via linear extrapolation. Provided_Portion = 0.25 # INFO: You have to download the dataset (audio files) from the website: # https://github.com/Jakobovski/free-spoken-digit-dataset # # Create a regular ol' Python List called 'zero' # Loop through the dataset and load up all 50 of the 0_jackson*.wav files # For each audio file, simply append the audio data (not the sample_rate, # just the data!) to the Python list 'zero': # zero = [] directory = "Datasets/free-spoken-digit-dataset-master/recordings/" for fname in os.listdir(directory): if fname.startswith("0_jackson"): fullname = os.path.join(directory, fname) sample_rate, data = wavfile.read(fullname) zero.append( data ) # # convert zero into a DataFrame and set the dtype to # np.int16, since the input audio files are 16 # bits per sample. This is important otherwise the produced audio samples # will be encoded as 64 bits per sample and will be too short. zeroDF = pd.DataFrame(zero, dtype=np.int16) # # Since these audio clips are unfortunately not length-normalized, # we're going to have to just hard chop them to all be the same length. # Since Pandas would have inserted NANs at any spot to make zero a # perfectly rectangular [n_observed_samples, n_audio_samples] array, # do a dropna on the Y axis here. Then, convert one back into an # NDArray using .values # if zeroDF.isnull().values.any() == True: print("Preprocessing data: dropping all NaN") zeroDF.dropna(axis=1, inplace=True) else: print("Preprocessing data: No NaN found!") zero = zeroDF.values # this is a list # # It's important to know how (many audio_samples samples) long the # data is now. 'zero' is currently shaped [n_samples, n_audio_samples] # n_audio_samples = zero.shape[1] # # Create the linear regression model # model = linear_model.LinearRegression() # # INFO: There are 50 takes of each clip. You want to pull out just one # of them, randomly, and that one will NOT be used in the training of # the model. 
#
# INFO: There are 50 takes of each clip. You want to pull out just one
# of them, randomly, and that one will NOT be used in the training of
# the model. In other words, the one file we'll be testing / scoring
# on will be an unseen sample, independent of the rest of the
# training set:
from sklearn.utils.validation import check_random_state

rng = check_random_state(7)
random_idx = rng.randint(zero.shape[0])
test = zero[random_idx]                        # the test sample
train = np.delete(zero, [random_idx], axis=0)  # everything else

#
# Print out the shape of train, and the shape of test.
# train will be shaped [n_samples, n_audio_samples], where
# n_audio_samples are the 'features' of the audio file.
# test will be shaped [n_audio_samples], since it is a single
# sample (audio file, e.g. observation):
print(train.shape)
print(test.shape)

#
# INFO: The test data will have two parts, X_test and y_test. X_test is
# going to be the first portion of the test audio file, which we will
# be providing the computer as input. y_test, the "label" if you will,
# is going to be the remaining portion of the audio file. That way,
# the computer will use linear regression to derive the missing
# portion of the sound file based off of the training data it's received!

#
# Save the original 'test' clip, the one you're about to delete
# part of, so that you can compare it to the 'patched' clip once
# you've generated it.
# This assumes the sample rate is the same for all samples:
wavfile.write('OriginalTestClip.wav', sample_rate, test)

#
# Prepare the TEST data by creating a slice called X_test. It
# should have Provided_Portion * n_audio_samples audio sample features,
# taken from the test audio file, currently stored in the variable
# 'test':
test_samples = int(Provided_Portion * n_audio_samples)
X_test = test[0:test_samples]  # the first (provided) portion

#
# If the first Provided_Portion * n_audio_samples features were
# stored in X_test, then we also need to grab the *remaining* audio
# features and store them in y_test. With the remaining features stored
# in there, we will be able to R^2 "score" how well our algorithm did
# in completing the sound file:
y_test = test[test_samples:]  # the remaining (chopped-off) portion

#
# Duplicate the same process for X_train and y_train:
X_train = train[:, 0:test_samples]  # the first (provided) portion
y_train = train[:, test_samples:]   # the remaining (chopped-off) portion

#
# SciKit-Learn gets mad if you don't supply your training
# data in the form of a 2D array: [n_samples, n_features].
#
# So if you only have one SAMPLE, as is our case with X_test
# and y_test, then by calling .reshape(1, -1) you can turn
# [n_features] into [1, n_features]:
X_test = X_test.reshape(1, -1)
y_test = y_test.reshape(1, -1)

#
# Fit the model using the training data and labels:
model.fit(X_train, y_train)

#
# Use the model to predict the 'label' of X_test:
y_test_prediction = model.predict(X_test)

# INFO: SciKit-Learn will use float64 to generate the predictions,
# so let's take those values back down to int16:
y_test_prediction = y_test_prediction.astype(dtype=np.int16)

#
# Score how well the prediction would do, for some good laughs,
# by passing in the test data and test label (y_test):
score = model.score(X_test, y_test)
print("Extrapolation R^2 Score:", score)

#
# First, take the Provided_Portion portion of the test clip, the
# part you fed into your linear regression model. Then, stitch that
# together with the abomination the predictor model generated for you,
# and save the completed audio clip:
completed_clip = np.hstack((X_test, y_test_prediction))
wavfile.write('ExtrapolatedClip.wav', sample_rate, completed_clip[0])
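
#
# An optional follow-up (an assumption, not part of the original gist):
# R^2 can be hard to interpret for audio, so also report the
# root-mean-square error between the predicted and true missing
# portions, in raw sample units (int16 amplitudes span -32768..32767):
rmse = np.sqrt(np.mean((y_test_prediction.astype(np.float64) -
                        y_test.astype(np.float64)) ** 2))
print("Extrapolation RMSE (raw sample units):", rmse)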