""" Use linear regression to recover or 'fill out' a completely deleted portion of an audio file! This will be using The FSDD, Free-Spoken-Digits-Dataset, an audio dataset put together by Zohar Jackson: cleaned up audio (no dead-space, roughly same length, same bitrate, same samples-per-second rate, etc) samples ready for machine learning. """ import numpy as np import pandas as pd import os from sklearn import linear_model import scipy.io.wavfile as wavfile # # INFO: # Samples = Observations. Each audio file is a single sample # in the dataset. # # Audio Samples = https://en.wikipedia.org/wiki/Sampling_(signal_processing) # Each .wav file is actually just a bunch of numeric samples, "sampled" # from the analog signal. Sampling is a type of discretization. When we # mention 'samples', we mean observations. When we mention 'audio samples', # we mean the actually "features" of the audio file. # # # The goal of this gist is to use multi-target, linear regression to generate # by extrapolation, the missing portion of the test audio file. # # Each one audio_sample features will be the output of an equation, # which is a function of the provided portion of the audio_samples: # # missing_samples = f(provided_samples) # # You can experiment with how much of the audio you want to chop off # and have the computer generate using the Provided_Portion parameter. # # Play with this. This is how much of the audio file will # be provided, in percent. The remaining percent of the file will # be generated via linear extrapolation. Provided_Portion = 0.25 # INFO: You have to download the dataset (audio files) from the website: # https://github.com/Jakobovski/free-spoken-digit-dataset # # Create a regular ol' Python List called 'zero' # Loop through the dataset and load up all 50 of the 0_jackson*.wav files # For each audio file, simply append the audio data (not the sample_rate, # just the data!) to the Python list 'zero': # zero = [] directory = "Datasets/free-spoken-digit-dataset-master/recordings/" for fname in os.listdir(directory): if fname.startswith("0_jackson"): fullname = os.path.join(directory, fname) sample_rate, data = wavfile.read(fullname) zero.append( data ) # # convert zero into a DataFrame and set the dtype to # np.int16, since the input audio files are 16 # bits per sample. This is important otherwise the produced audio samples # will be encoded as 64 bits per sample and will be too short. zeroDF = pd.DataFrame(zero, dtype=np.int16) # # Since these audio clips are unfortunately not length-normalized, # we're going to have to just hard chop them to all be the same length. # Since Pandas would have inserted NANs at any spot to make zero a # perfectly rectangular [n_observed_samples, n_audio_samples] array, # do a dropna on the Y axis here. Then, convert one back into an # NDArray using .values # if zeroDF.isnull().values.any() == True: print("Preprocessing data: dropping all NaN") zeroDF.dropna(axis=1, inplace=True) else: print("Preprocessing data: No NaN found!") zero = zeroDF.values # this is a list # # It's important to know how (many audio_samples samples) long the # data is now. 'zero' is currently shaped [n_samples, n_audio_samples] # n_audio_samples = zero.shape[1] # # Create the linear regression model # model = linear_model.LinearRegression() # # INFO: There are 50 takes of each clip. You want to pull out just one # of them, randomly, and that one will NOT be used in the training of # the model. 
#
# INFO: There are 50 takes of each clip. You want to pull out just one
# of them, randomly, and that one will NOT be used in the training of
# the model. In other words, the one file we'll be testing / scoring
# on will be an unseen sample, independent of the rest of the
# training set:
from sklearn.utils.validation import check_random_state

rng = check_random_state(7)
random_idx = rng.randint(zero.shape[0])
test = zero[random_idx]                        # the test sample
train = np.delete(zero, [random_idx], axis=0)  # everything else

#
# Print out the shape of train, and the shape of test.
# train will be shaped [n_samples, n_audio_samples], where
# n_audio_samples are the 'features' of the audio file.
# test will be shaped [n_audio_samples], since it is a single
# sample (audio file, e.g. observation):
print(train.shape)
print(test.shape)

#
# INFO: The test data will have two parts, X_test and y_test. X_test is
# going to be the first portion of the test audio file, which we will
# be providing the computer as input. y_test, the "label" if you will,
# is going to be the remaining portion of the audio file. That way,
# the computer will use linear regression to derive the missing
# portion of the sound file based off of the training data it's received!

#
# Save the original 'test' clip, the one you're about to delete
# part of, so that you can compare it to the 'patched' clip once
# you've generated it.
# This assumes the sample rate is the same for all samples:
wavfile.write('OriginalTestClip.wav', sample_rate, test)

#
# Prepare the TEST data by creating a slice called X_test. It
# should have Provided_Portion * n_audio_samples audio sample features,
# taken from the test audio file, currently stored in the variable
# 'test':
test_samples = int(Provided_Portion * n_audio_samples)
X_test = test[0:test_samples]  # the first (provided) portion

#
# If the first Provided_Portion * n_audio_samples features were
# stored in X_test, then we also need to grab the *remaining* audio
# features and store them in y_test. With the remaining features stored
# in there, we will be able to R^2 "score" how well our algorithm did
# in completing the sound file:
y_test = test[test_samples:]  # the remaining (chopped-off) portion

#
# Duplicate the same process for X_train and y_train:
X_train = train[:, 0:test_samples]  # the first (provided) portion
y_train = train[:, test_samples:]   # the remaining (chopped-off) portion

#
# SciKit-Learn gets mad if you don't supply your training
# data in the form of a 2D array: [n_samples, n_features].
#
# So if you only have one SAMPLE, as is our case with X_test
# and y_test, then by calling .reshape(1, -1) you can turn
# [n_features] into [1, n_features]:
X_test = X_test.reshape(1, -1)
y_test = y_test.reshape(1, -1)

#
# Fit the model using the training data and labels:
model.fit(X_train, y_train)

#
# Use the model to predict the 'label' of X_test:
y_test_prediction = model.predict(X_test)

# INFO: SciKit-Learn will use float64 to generate the predictions,
# so let's take those values back down to int16:
y_test_prediction = y_test_prediction.astype(dtype=np.int16)

#
# Score how well the prediction would do, for some good laughs,
# by passing in the test data and test label (y_test):
score = model.score(X_test, y_test)
print("Extrapolation R^2 Score:", score)

#
# First, take the Provided_Portion portion of the test clip, the
# part you fed into your linear regression model. Then, stitch that
# together with the abomination the predictor model generated for you,
# and save the completed audio clip:
completed_clip = np.hstack((X_test, y_test_prediction))
wavfile.write('ExtrapolatedClip.wav', sample_rate, completed_clip[0])
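
#
# An optional follow-up (an assumption, not part of the original gist):
# R^2 can be hard to interpret for audio, so also report the
# root-mean-square error between the predicted and true missing
# portions, in raw sample units (int16 amplitudes span -32768..32767):
rmse = np.sqrt(np.mean((y_test_prediction.astype(np.float64) -
                        y_test.astype(np.float64)) ** 2))
print("Extrapolation RMSE (raw sample units):", rmse)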