Skip to content

Instantly share code, notes, and snippets.

@kastnerkyle
Last active November 17, 2024 12:01
Show Gist options
  • Save kastnerkyle/179d6e9a88202ab0a2fe to your computer and use it in GitHub Desktop.

Revisions

  1. kastnerkyle revised this gist Jul 7, 2017. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,5 @@
    raise ValueError("DEPRECATED/FROZEN - see https://github.com/kastnerkyle/tools for the latest")

    # License: BSD 3-clause
    # Authors: Kyle Kastner
    # Harvest, Cheaptrick, D4C, WORLD routines based on MATLAB code from M. Morise
  2. kastnerkyle revised this gist May 16, 2017. 1 changed file with 25 additions and 17 deletions.
    42 changes: 25 additions & 17 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -779,20 +779,20 @@ def lpc_analysis(X, order=8, window_step=128, window_size=2 * 128,
    pad_sizes = [(window_size - window_step) // 2,
    window_size - window_step // 2]
    # TODO: Handling for odd window sizes / steps
    X = np.hstack((np.zeros((X.shape[0], pad_sizes[0])), X,
    np.zeros((X.shape[0], pad_sizes[1]))))
    X = np.hstack((np.zeros((X.shape[0], int(pad_sizes[0]))), X,
    np.zeros((X.shape[0], int(pad_sizes[1])))))
    else:
    pad_sizes = [0, 0]
    X = X[0, :n_windows * window_step]

    lp_coefficients = np.zeros((n_windows, order + 1))
    per_frame_gain = np.zeros((n_windows, 1))
    residual_excitation = np.zeros(
    ((n_windows - 1) * window_step + window_size))
    int(((n_windows - 1) * window_step + window_size)))
    # Pre-emphasis high-pass filter
    X = sg.lfilter([1, -emphasis], 1, X)
    # stride_tricks.as_strided?
    autocorr_X = np.zeros((n_windows, 2 * window_size - 1))
    autocorr_X = np.zeros((n_windows, int(2 * window_size - 1)))
    for window in range(max(n_windows - 1, 1)):
    wtws = int(window * window_step)
    XX = X.ravel()[wtws + np.arange(window_size, dtype="int32")]
    @@ -811,7 +811,7 @@ def lpc_analysis(X, order=8, window_step=128, window_size=2 * 128,
    assign_range = wtws + np.arange(window_size, dtype="int32")
    residual_excitation[assign_range] += residual_signal / gain
    # Throw away first part in overlap mode for proper synthesis
    residual_excitation = residual_excitation[pad_sizes[0]:]
    residual_excitation = residual_excitation[int(pad_sizes[0]):]
    return lp_coefficients, per_frame_gain, residual_excitation


    @@ -1176,7 +1176,7 @@ def sinusoid_analysis(X, input_sample_rate, resample_block=128, copy=True):
    # Should be able to use resample... ?
    # resampled_count = round(len(X) * resample_to / input_sample_rate)
    # X = sg.resample(X, resampled_count, window=sg.hanning(len(X)))
    X = sg.decimate(X, input_sample_rate // resample_to)
    X = sg.decimate(X, input_sample_rate // resample_to, zero_phase=True)
    step_size = 2 * round(resample_block / input_sample_rate * resample_to / 2.)
    a, g, e = lpc_analysis(X, order=8, window_step=step_size,
    window_size=2 * step_size)
    @@ -4088,34 +4088,42 @@ def run_world_mgc_example():
    # this is actually just mcep
    mgc_gamma = 0.0

    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    #from sklearn.externals import joblib
    #mem = joblib.Memory("/tmp")
    #mem.clear()

    def enc():
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs,
    temporal_positions_h, f0_h, vuv_h)

    mgc_arr = sp2mgc(spectrogram_ct, mgc_order, mgc_alpha, mgc_gamma,
    verbose=True)
    mgc_arr = sp2mgc(spectrogram_ct, mgc_order, mgc_alpha, mgc_gamma,
    verbose=True)
    return mgc_arr, spectrogram_ct, f0_d4c, vuv_d4c, coarse_aper_d4c

    from sklearn.externals import joblib
    mem = joblib.Memory("/tmp")
    mem.clear()

    mgc_arr, spectrogram_ct, f0_d4c, vuv_d4c, coarse_aper_d4c = enc()
    sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fs=fs, verbose=True)

    """
    import matplotlib.pyplot as plt
    plt.imshow(20 * np.log10(sp_r))
    plt.figure()
    plt.imshow(20 * np.log10(spectrogram_ct))
    plt.show()
    raise ValueError()
    """

    y = world_synthesis(f0_d4c, vuv_d4c, coarse_aper_d4c, sp_r, fs)
    #y = world_synthesis(f0_d4c, vuv_d4c, aper_d4c, sp_r, fs)
    wavfile.write("out_mgc.wav", fs, soundsc(y))


    if __name__ == "__main__":
    run_world_mgc_example()
    run_lpc_example()
    #run_world_mgc_example()
    """
    Trying to run all examples will seg fault on my laptop - probably memory!
    Comment individually
  3. kastnerkyle revised this gist Mar 12, 2017. 1 changed file with 10 additions and 29 deletions.
    39 changes: 10 additions & 29 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -3392,7 +3392,7 @@ def _sp_convert(c_i, order, alpha, gamma, miniter, maxiter, criteria,
    def sp2mgc(sp, order=20, alpha=0.35, gamma=-0.41, miniter=2,
    maxiter=30, criteria=0.001, otype=0, verbose=False):
    """
    Accepts 1D or 2D spectrum (complex or real valued).
    Accepts 1D or 2D one-sided spectrum (complex or real valued).
    If 2D, assumes time is axis 0.
    @@ -3401,11 +3401,14 @@ def sp2mgc(sp, order=20, alpha=0.35, gamma=-0.41, miniter=2,
    Based on r9y9 Julia code
    https://github.com/r9y9/MelGeneralizedCepstrums.jl
    """

    if len(sp.shape) == 1:
    sp = np.concatenate((sp, sp[:, 1:][:, ::-1]), axis=0)
    return _sp2mgc(sp, order=order, alpha=alpha, gamma=gamma,
    miniter=miniter, maxiter=maxiter, criteria=criteria,
    otype=otype, verbose=verbose)
    else:
    sp = np.concatenate((sp, sp[:, 1:][:, ::-1]), axis=1)
    # Slooow, use multiprocessing to speed up a bit
    # http://blog.shenwei.me/python-multiprocessing-pool-difference-between-map-apply-map_async-apply_async/
    # http://stackoverflow.com/questions/5666576/show-the-progress-of-a-python-multiprocessing-pool-map-call
    @@ -3597,11 +3600,9 @@ def mgc2sp(mgc_arr, alpha=0.35, gamma=-0.41, fftlen="auto", fs=None,
    for i in range(len(_mgc_convert_results)):
    _mgc_convert_results.pop()
    c = np.array(final)
    fc = np.fft.rfft(c, axis=-1)
    buf = np.zeros((len(c), fftlen // 2 + 1), dtype=fc.dtype)
    buf += fc.min()
    buf[:len(c), :fc.shape[1]] = fc[:]
    return np.exp(buf.real)
    buf = np.zeros((len(c), fftlen), dtype=c.dtype)
    buf[:, :c.shape[1]] = c[:]
    return np.exp(np.fft.rfft(buf, axis=-1).real)


    def implot(arr, scale=None, title="", cmap="gray"):
    @@ -4079,7 +4080,6 @@ def run_world_mgc_example():
    fs, d = fetch_sample_speech_tapestry()
    d = d.astype("float32") / 2 ** 15

    fftlen = 1024
    # hardcoded for 16k from
    # https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world/extract_features_for_merlin.sh
    mgc_alpha = 0.58
    @@ -4088,7 +4088,6 @@ def run_world_mgc_example():
    # this is actually just mcep
    mgc_gamma = 0.0

    """
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    @@ -4097,37 +4096,19 @@ def run_world_mgc_example():

    mgc_arr = sp2mgc(spectrogram_ct, mgc_order, mgc_alpha, mgc_gamma,
    verbose=True)
    """

    from sklearn.externals import joblib
    mem = joblib.Memory("/tmp")
    ch = mem.cache(harvest)
    cc = mem.cache(cheaptrick)
    cd = mem.cache(d4c)

    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = ch(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cc(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = cd(d, fs,
    temporal_positions_h, f0_h, vuv_h)

    cs2m = mem.cache(sp2mgc)
    mgc_arr = cs2m(spectrogram_ct, mgc_order, mgc_alpha, mgc_gamma,
    verbose=True)
    mem.clear()

    # sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fs=fs, verbose=True)
    sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fftlen=fftlen,
    verbose=True)
    sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fs=fs, verbose=True)

    import matplotlib.pyplot as plt
    plt.imshow(20 * np.log10(sp_r))
    plt.figure()
    plt.imshow(20 * np.log10(spectrogram_ct))
    plt.show()

    """
    from IPython import embed; embed()
    raise ValueError()
    """
    y = world_synthesis(f0_d4c, vuv_d4c, coarse_aper_d4c, sp_r, fs)
    #y = world_synthesis(f0_d4c, vuv_d4c, aper_d4c, sp_r, fs)
    wavfile.write("out_mgc.wav", fs, soundsc(y))
  4. kastnerkyle revised this gist Mar 12, 2017. 1 changed file with 64 additions and 18 deletions.
    82 changes: 64 additions & 18 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2574,8 +2574,7 @@ def cheaptrick(x, fs, temporal_positions, f0_sequence,
    default_f0 = 500
    if fftlen == "auto":
    fftlen = int(2 ** np.ceil(np.log2(3. * float(fs) / f0_low_limit + 1)))
    else:
    raise ValueError("Only fftlen auto currently supported")
    #raise ValueError("Only fftlen auto currently supported")
    fft_size = fftlen
    f0_low_limit = fs * 3.0 / (fft_size - 3.0)
    f0_sequence[vuv == 0] = default_f0
    @@ -3115,12 +3114,22 @@ def _mgc_gc2gc(src_ceps, src_gamma=0., dst_order=None, dst_gamma=0.):
    itr = [2]
    else:
    itr = []

    """
    # old slower version
    for k in itr:
    assert k >= 1
    assert (m - k) >= 0
    cc = src_ceps[k - 1] * dst_ceps[m - k]
    ss2 += (k - 1) * cc
    ss1 += (m - k) * cc
    """

    if len(itr) > 0:
    itr = np.array(itr)
    cc_a = src_ceps[itr - 1] * dst_ceps[m - itr]
    ss2 += ((itr - 1) * cc_a).sum()
    ss1 += ((m - itr) * cc_a).sum()

    if m <= m1 + 1:
    dst_ceps[m - 1] = src_ceps[m - 1] + (dst_gamma * ss2 - src_gamma * ss1)/(m - 1.)
    @@ -3414,13 +3423,16 @@ def sp2mgc(sp, order=20, alpha=0.35, gamma=-0.41, miniter=2,
    if (sz * itr._chunksize) != len(c):
    sz += 1

    last_remaining = None
    while True:
    remaining = itr._number_left
    if verbose:
    print("%i chunks of %i complete" % (sz - remaining, sz))
    if remaining != last_remaining:
    last_remaining = remaining
    print("%i chunks of %i complete" % (sz - remaining, sz))
    if itr.ready():
    break
    time.sleep(10)
    time.sleep(.5)

    """
    # takes ~455s for 630 frames
    @@ -3557,20 +3569,23 @@ def mgc2sp(mgc_arr, alpha=0.35, gamma=-0.41, fftlen="auto", fs=None,
    #itr = p.map(functools.partial(_mgc_convert, alpha=alpha, gamma=gamma, fftlen=fftlen), c)
    #raise ValueError()

    # 500.1 s for 630 frames1 process
    # 500.1 s for 630 frames process
    itr = p.map_async(functools.partial(_mgc_convert, alpha=alpha, gamma=gamma, fftlen=fftlen), c, callback=_mgc_collect_result)

    sz = len(c) // itr._chunksize
    if (sz * itr._chunksize) != len(c):
    sz += 1

    last_remaining = None
    while True:
    remaining = itr._number_left
    if verbose:
    print("%i chunks of %i complete" % (sz - remaining, sz))
    if last_remaining != remaining:
    last_remaining = remaining
    print("%i chunks of %i complete" % (sz - remaining, sz))
    if itr.ready():
    break
    time.sleep(10)
    time.sleep(.5)
    p.close()
    p.join()
    stop = time.time()
    @@ -3581,10 +3596,7 @@ def mgc2sp(mgc_arr, alpha=0.35, gamma=-0.41, fftlen="auto", fs=None,
    final = [o[1] for o in sorted(flat, key=lambda x: x[0])]
    for i in range(len(_mgc_convert_results)):
    _mgc_convert_results.pop()
    # resample
    c = np.array(final)
    #buf = np.zeros((len(c), fftlen), dtype=c.dtype)
    #buf[:len(c), :c.shape[1]] = c[:]
    fc = np.fft.rfft(c, axis=-1)
    buf = np.zeros((len(c), fftlen // 2 + 1), dtype=fc.dtype)
    buf += fc.min()
    @@ -4067,22 +4079,55 @@ def run_world_mgc_example():
    fs, d = fetch_sample_speech_tapestry()
    d = d.astype("float32") / 2 ** 15

    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs,
    temporal_positions_h, f0_h, vuv_h)

    fftlen = 1024
    # hardcoded for 16k from
    # https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world/extract_features_for_merlin.sh
    mgc_alpha = 0.58
    #mgc_order = 59
    mgc_order = 59
    # this is actually just mcep
    mgc_gamma = 0.0

    """
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    mgc_arr = sp2mgc(spectrogram_ct, mgc_order, mgc_alpha, mgc_gamma,
    verbose=True)
    sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fs=fs, verbose=True)
    """
    from sklearn.externals import joblib
    mem = joblib.Memory("/tmp")
    ch = mem.cache(harvest)
    cc = mem.cache(cheaptrick)
    cd = mem.cache(d4c)

    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = ch(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cc(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = cd(d, fs,
    temporal_positions_h, f0_h, vuv_h)

    cs2m = mem.cache(sp2mgc)
    mgc_arr = cs2m(spectrogram_ct, mgc_order, mgc_alpha, mgc_gamma,
    verbose=True)

    # sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fs=fs, verbose=True)
    sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fftlen=fftlen,
    verbose=True)

    import matplotlib.pyplot as plt
    plt.imshow(20 * np.log10(sp_r))
    plt.figure()
    plt.imshow(20 * np.log10(spectrogram_ct))
    plt.show()

    """
    from IPython import embed; embed()
    raise ValueError()
    """
    y = world_synthesis(f0_d4c, vuv_d4c, coarse_aper_d4c, sp_r, fs)
    #y = world_synthesis(f0_d4c, vuv_d4c, aper_d4c, sp_r, fs)
    wavfile.write("out_mgc.wav", fs, soundsc(y))
    @@ -4093,8 +4138,9 @@ def run_world_mgc_example():
    """
    Trying to run all examples will seg fault on my laptop - probably memory!
    Comment individually
    run_mgc_example()
    run_world_mgc_example()
    run_world_example()
    run_mgc_example()
    run_phase_reconstruction_example()
    run_phase_vq_example()
    run_dct_vq_example()
  5. kastnerkyle revised this gist Mar 12, 2017. 1 changed file with 958 additions and 725 deletions.
    1,683 changes: 958 additions & 725 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -21,6 +21,10 @@
    import tarfile
    import os
    import copy
    import multiprocessing
    from multiprocessing import Pool
    import functools
    import time
    try:
    import urllib.request as urllib # for backwards compatibility
    except ImportError:
    @@ -2564,14 +2568,15 @@ def cheaptrick_estimate_one_slice(x, fs, current_f0,


    def cheaptrick(x, fs, temporal_positions, f0_sequence,
    vuv, fft_size="auto", q1=-0.15):
    vuv, fftlen="auto", q1=-0.15):
    f0_sequence = f0_sequence.copy()
    f0_low_limit = 71
    default_f0 = 500
    if fft_size == "auto":
    fft_size = int(2 ** np.ceil(np.log2(3. * float(fs) / f0_low_limit + 1)))
    if fftlen == "auto":
    fftlen = int(2 ** np.ceil(np.log2(3. * float(fs) / f0_low_limit + 1)))
    else:
    raise ValueError("Only fft_size auto currently supported")
    raise ValueError("Only fftlen auto currently supported")
    fft_size = fftlen
    f0_low_limit = fs * 3.0 / (fft_size - 3.0)
    f0_sequence[vuv == 0] = default_f0
    spectrogram = np.zeros((int(fft_size / 2.) + 1, len(f0_sequence)))
    @@ -2580,7 +2585,7 @@ def cheaptrick(x, fs, temporal_positions, f0_sequence,
    f0_sequence[i] = default_d0
    spectrogram[:, i] = cheaptrick_estimate_one_slice(x, fs, f0_sequence[i],
    temporal_positions[i], fft_size, q1)
    return temporal_positions, spectrogram, fs
    return temporal_positions, spectrogram.T, fs


    def d4c_love_train(x, fs, current_f0, current_position, threshold):
    @@ -2769,7 +2774,7 @@ def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
    piece = np.concatenate([[-60], -coarse_aperiodicity.ravel(), [-0.000000000001]])
    part = interp1d(coarse_axis, piece, kind="linear")(frequency_axis) / 20.
    aperiodicity[:, i] = 10 ** part
    return temporal_positions_h, f0_h, vuv_h, aperiodicity, coarse_ap
    return temporal_positions_h, f0_h, vuv_h, aperiodicity.T, coarse_ap.squeeze()


    def world_synthesis_time_base_generation(temporal_positions, f0, fs, vuv,
    @@ -2891,11 +2896,17 @@ def fftfilt(b, x, *n):
    def world_synthesis(f0_d4c, vuv_d4c, aperiodicity_d4c,
    spectrogram_ct, fs_ct, random_seed=1999):

    # swap 0 and 1 axis
    spectrogram_ct = spectrogram_ct.T
    fs = fs_ct
    # coarse -> fine aper
    if aperiodicity_d4c.shape[0] == 1:
    if len(aperiodicity_d4c.shape) == 1 or aperiodicity_d4c.shape[1] == 1:
    print("Coarse aperiodicity detected - interpolating to full size")
    aper = np.zeros_like(spectrogram_ct)
    if len(aperiodicity_d4c.shape) == 1:
    aperiodicity_d4c = aperiodicity_d4c[None, :]
    else:
    aperiodicity_d4c = aperiodicity_d4c.T
    coarse_aper_d4c = aperiodicity_d4c
    frequency_interval = 3000
    upper_limit = 15000
    @@ -2904,6 +2915,7 @@ def world_synthesis(f0_d4c, vuv_d4c, aperiodicity_d4c,
    coarse_axis[-1] = fs / 2.
    f0_low_limit_for_spectrum = 71
    fft_size_for_spectrum = 2 ** np.ceil(np.log2(3 * fs / f0_low_limit_for_spectrum + 1.))

    frequency_axis = np.arange(int(fft_size_for_spectrum / 2) + 1) * float(fs) / fft_size_for_spectrum

    for i in range(len(f0_d4c)):
    @@ -2914,6 +2926,8 @@ def world_synthesis(f0_d4c, vuv_d4c, aperiodicity_d4c,
    part = interp1d(coarse_axis, piece, kind="linear")(frequency_axis) / 20.
    aper[:, i] = 10 ** part
    aperiodicity_d4c = aper
    else:
    aperiodicity_d4c = aperiodicity_d4c.T

    default_f0 = 500.
    random_state = np.random.RandomState(1999)
    @@ -2989,851 +3003,1044 @@ def world_synthesis(f0_d4c, vuv_d4c, aperiodicity_d4c,
    return y


    def implot(arr, scale=None, title="", cmap="gray"):
    import matplotlib.pyplot as plt
    if scale is "specgram":
    # plotting part
    mag = 20. * np.log10(np.abs(arr))
    # Transpose so time is X axis, and invert y axis so
    # frequency is low at bottom
    mag = mag.T[::-1, :]
    else:
    mag = arr
    f, ax = plt.subplots()
    ax.matshow(mag, cmap=cmap)
    plt.axis("off")
    x1 = mag.shape[0]
    y1 = mag.shape[1]
    def _mgc_b2c(wc, c, alpha):
    wc_o = np.zeros_like(wc)
    desired_order = len(wc) - 1
    for i in range(0, len(c))[::-1]:
    prev = copy.copy(wc_o)
    wc_o[0] = c[i]
    if desired_order >= 1:
    wc_o[1] = (1. - alpha ** 2) * prev[0] + alpha * prev[1]
    for m in range(2, desired_order + 1):
    wc_o[m] = prev[m - 1] + alpha * (prev[m] - wc_o[m - 1])
    return wc_o

    def autoaspect(x_range, y_range):
    """
    The aspect to make a plot square with ax.set_aspect in Matplotlib
    """
    mx = max(x_range, y_range)
    mn = min(x_range, y_range)
    if x_range <= y_range:
    return mx / float(mn)
    else:
    return mn / float(mx)
    asp = autoaspect(x1, y1)
    ax.set_aspect(asp)
    plt.title(title)

    def _mgc_ptrans(p, m, alpha):
    d = 0.
    o = 0.

    def test_lpc_to_lsf():
    # Matlab style vectors for testing
    # lsf = [0.7842 1.5605 1.8776 1.8984 2.3593]
    # a = [1.0000 0.6149 0.9899 0.0000 0.0031 -0.0082];
    lsf = [[0.7842, 1.5605, 1.8776, 1.8984, 2.3593],
    [0.7842, 1.5605, 1.8776, 1.8984, 2.3593]]
    a = [[1.0000, 0.6149, 0.9899, 0.0000, 0.0031, -0.0082],
    [1.0000, 0.6149, 0.9899, 0.0000, 0.0031, -0.0082]]
    a = np.array(a)
    lsf = np.array(lsf)
    lsf_r = lpc_to_lsf(a)
    assert_almost_equal(lsf, lsf_r, decimal=4)
    a_r = lsf_to_lpc(lsf)
    assert_almost_equal(a, a_r, decimal=4)
    lsf_r = lpc_to_lsf(a[0])
    assert_almost_equal(lsf[0], lsf_r, decimal=4)
    a_r = lsf_to_lpc(lsf[0])
    assert_almost_equal(a[0], a_r, decimal=4)
    d = p[m]
    for i in range(1, m)[::-1]:
    o = p[i] + alpha * d
    d = p[i]
    p[i] = o

    o = alpha * d
    p[0] = (1. - alpha ** 2) * p[0] + 2 * o

    def test_lpc_analysis_truncate():
    # Test that truncate doesn't crash and actually truncates
    [a, g, e] = lpc_analysis(np.random.randn(85), order=8, window_step=80,
    window_size=80, emphasis=0.9, truncate=True)
    assert(a.shape[0] == 1)

    def _mgc_qtrans(q, m, alpha):
    d = q[1]
    for i in range(2, 2 * m + 1):
    o = q[i] + alpha * d
    d = q[i]
    q[i] = o

    def test_feature_build():
    samplerate, X = fetch_sample_music()
    # MATLAB wavread does normalization
    X = X.astype('float32') / (2 ** 15)
    wsz = 256
    wst = 128
    a, g, e = lpc_analysis(X, order=8, window_step=wst,
    window_size=wsz, emphasis=0.9,
    copy=True)
    v, p = voiced_unvoiced(X, window_size=wsz,
    window_step=wst)
    c = compress(e, n_components=64)
    # First component of a is always 1
    combined = np.hstack((a[:, 1:], g, c[:a.shape[0]]))
    features = np.zeros((a.shape[0], 2 * combined.shape[1]))
    start_indices = v * combined.shape[1]
    start_indices = start_indices.astype('int32')
    end_indices = (v + 1) * combined.shape[1]
    end_indices = end_indices.astype('int32')
    for i in range(features.shape[0]):
    features[i, start_indices[i]:end_indices[i]] = combined[i]

    def _mgc_gain(er, c, m, g):
    t = 0.
    if g != 0:
    for i in range(1, m + 1):
    t += er[i] * c[i]
    return er[0] + g * t
    else:
    return er[0]

    def test_mdct_and_inverse():
    fs, X = fetch_sample_music()
    X_dct = mdct_slow(X)
    X_r = imdct_slow(X_dct)
    assert np.all(np.abs(X_r[:len(X)] - X) < 1E-3)
    assert np.abs(X_r[:len(X)] - X).mean() < 1E-6

    def _mgc_fill_toeplitz(A, t):
    n = len(t)
    for i in range(n):
    for j in range(n):
    A[i, j] = t[i - j] if i - j >= 0 else t[j - i]

    def test_all():
    test_lpc_analysis_truncate()
    test_feature_build()
    test_lpc_to_lsf()
    test_mdct_and_inverse()

    def _mgc_fill_hankel(A, t):
    n = len(t) // 2 + 1
    for i in range(n):
    for j in range(n):
    A[i, j] = t[i + j]

    def run_lpc_example():
    # ae.wav is from
    # http://www.linguistics.ucla.edu/people/hayes/103/Charts/VChart/ae.wav
    # Partially following the formant tutorial here
    # http://www.mathworks.com/help/signal/ug/formant-estimation-with-lpc-coefficients.html

    samplerate, X = fetch_sample_music()
    def _mgc_ignorm(c, gamma):
    if gamma == 0.:
    c[0] = np.log(c[0])
    return c
    gain = c[0] ** gamma
    c[1:] *= gain
    c[0] = (gain - 1.) / gamma

    c = overlap_compress(X, 200, 400)
    X_r = overlap_uncompress(c, 400)
    wavfile.write('lpc_uncompress.wav', samplerate, soundsc(X_r))

    print("Calculating sinusoids")
    f_hz, m = sinusoid_analysis(X, input_sample_rate=16000)
    Xs_sine = sinusoid_synthesis(f_hz, m)
    orig_fname = 'lpc_orig.wav'
    sine_fname = 'lpc_sine_synth.wav'
    wavfile.write(orig_fname, samplerate, soundsc(X))
    wavfile.write(sine_fname, samplerate, soundsc(Xs_sine))
    def _mgc_gnorm(c, gamma):
    if gamma == 0.:
    c[0] = np.exp(c[0])
    return c
    gain = 1. + gamma * c[0]
    c[1:] /= gain
    c[0] = gain ** (1. / gamma)

    lpc_order_list = [8, ]
    dct_components_list = [200, ]
    window_size_list = [400, ]
    # Seems like a dct component size of ~2/3rds the step
    # (1/3rd the window for 50% overlap) works well.
    for lpc_order in lpc_order_list:
    for dct_components in dct_components_list:
    for window_size in window_size_list:
    # 50% overlap
    window_step = window_size // 2
    a, g, e = lpc_analysis(X, order=lpc_order,
    window_step=window_step,
    window_size=window_size, emphasis=0.9,
    copy=True)
    print("Calculating LSF")
    lsf = lpc_to_lsf(a)
    # Not window_size - window_step! Need to implement overlap
    print("Calculating compression")
    c = compress(e, n_components=dct_components,
    window_size=window_step)
    co = overlap_compress(e, n_components=dct_components,
    window_size=window_step)
    block_excitation = uncompress(c, window_size=window_step)
    overlap_excitation = overlap_uncompress(co,
    window_size=window_step)
    a_r = lsf_to_lpc(lsf)
    f, m = lpc_to_frequency(a_r, g)
    block_lpc = lpc_synthesis(a_r, g, block_excitation,
    emphasis=0.9,
    window_step=window_step)
    overlap_lpc = lpc_synthesis(a_r, g, overlap_excitation,
    emphasis=0.9,
    window_step=window_step)
    v, p = voiced_unvoiced(X, window_size=window_size,
    window_step=window_step)
    noisy_lpc = lpc_synthesis(a_r, g, voiced_frames=v,
    emphasis=0.9,
    window_step=window_step)
    if dct_components is None:
    dct_components = window_size
    noisy_fname = 'lpc_noisy_synth_%iwin_%ilpc_%idct.wav' % (
    window_size, lpc_order, dct_components)
    block_fname = 'lpc_block_synth_%iwin_%ilpc_%idct.wav' % (
    window_size, lpc_order, dct_components)
    overlap_fname = 'lpc_overlap_synth_%iwin_%ilpc_%idct.wav' % (
    window_size, lpc_order, dct_components)
    wavfile.write(noisy_fname, samplerate, soundsc(noisy_lpc))
    wavfile.write(block_fname, samplerate,
    soundsc(block_lpc))
    wavfile.write(overlap_fname, samplerate,
    soundsc(overlap_lpc))

    def _mgc_b2mc(mc, alpha):
    m = len(mc)
    o = 0.
    d = mc[m - 1]
    for i in range(m - 1)[::-1]:
    o = mc[i] + alpha * d
    d = mc[i]
    mc[i] = o

    def run_fft_vq_example():
    n_fft = 512
    time_smoothing = 4
    def _pre(list_of_data):
    f_c = np.vstack([stft(dd, n_fft) for dd in list_of_data])
    if len(f_c) % time_smoothing != 0:
    newlen = len(f_c) - len(f_c) % time_smoothing
    f_c = f_c[:newlen]
    f_mag = complex_to_abs(f_c)
    f_phs = complex_to_angle(f_c)
    f_sincos = angle_to_sin_cos(f_phs)
    f_r = np.hstack((f_mag, f_sincos))
    f_r = f_r.reshape((len(f_r) // time_smoothing,
    time_smoothing * f_r.shape[1]))
    return f_r, n_fft

    def preprocess_train(list_of_data, random_state):
    f_r, n_fft = _pre(list_of_data)
    clusters = f_r
    return clusters
    def _mgc_mc2b(mc, alpha):
    itr = range(len(mc) - 1)[::-1]
    for i in itr:
    mc[i] = mc[i] - alpha * mc[i + 1]

    def apply_preprocess(list_of_data, clusters):
    f_r, n_fft = _pre(list_of_data)
    memberships, distances = vq(f_r, clusters)
    vq_r = clusters[memberships]
    vq_r = vq_r.reshape((time_smoothing * len(vq_r),
    vq_r.shape[1] // time_smoothing))
    f_mag = vq_r[:, :n_fft // 2 + 1]
    f_sincos = vq_r[:, n_fft // 2 + 1:]
    extent = f_sincos.shape[1] // 2
    f_phs = sin_cos_to_angle(f_sincos[:, :extent], f_sincos[:, extent:])
    vq_c = abs_and_angle_to_complex(f_mag, f_phs)
    d_k = istft(vq_c, fftsize=n_fft)
    return d_k

    random_state = np.random.RandomState(1999)
    def _mgc_gc2gc(src_ceps, src_gamma=0., dst_order=None, dst_gamma=0.):
    if dst_order == None:
    dst_order = len(src_ceps) - 1

    """
    fs, d = fetch_sample_music()
    sub = int(.8 * d.shape[0])
    d1 = [d[:sub]]
    d2 = [d[sub:]]
    """
    dst_ceps = np.zeros((dst_order + 1,), dtype=src_ceps.dtype)
    dst_order = len(dst_ceps) - 1
    m1 = len(src_ceps) - 1
    dst_ceps[0] = copy.deepcopy(src_ceps[0])

    fs, d = fetch_sample_speech_fruit()
    d1 = d[::8] + d[1::8] + d[2::8] + d[3::8] + d[4::8] + d[5::8] + d[6::8]
    d2 = d[7::8]
    # make sure d1 and d2 aren't the same!
    assert [len(di) for di in d1] != [len(di) for di in d2]
    for m in range(2, dst_order + 2):
    ss1 = 0.
    ss2 = 0.
    min_1 = m1 if (m1 < m - 1) else m - 2
    itr = range(2, min_1 + 2)
    if len(itr) < 1:
    if min_1 + 1 == 2:
    itr = [2]
    else:
    itr = []
    for k in itr:
    assert k >= 1
    assert (m - k) >= 0
    cc = src_ceps[k - 1] * dst_ceps[m - k]
    ss2 += (k - 1) * cc
    ss1 += (m - k) * cc

    clusters = preprocess_train(d1, random_state)
    # Training data
    vq_d1 = apply_preprocess(d1, clusters)
    vq_d2 = apply_preprocess(d2, clusters)
    assert [i != j for i, j in zip(vq_d1.ravel(), vq_d2.ravel())]
    if m <= m1 + 1:
    dst_ceps[m - 1] = src_ceps[m - 1] + (dst_gamma * ss2 - src_gamma * ss1)/(m - 1.)
    else:
    dst_ceps[m - 1] = (dst_gamma * ss2 - src_gamma * ss1) / (m - 1.)
    return dst_ceps

    fix_d1 = np.concatenate(d1)
    fix_d2 = np.concatenate(d2)

    wavfile.write("fft_train_no_agc.wav", fs, soundsc(fix_d1))
    wavfile.write("fft_test_no_agc.wav", fs, soundsc(fix_d2))
    wavfile.write("fft_vq_train_no_agc.wav", fs, soundsc(vq_d1, fs))
    wavfile.write("fft_vq_test_no_agc.wav", fs, soundsc(vq_d2, fs))
    def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    recursion_order, iter_number, y_fft, z_fft, cr, pr, rr, ri,
    qr, qi, Tm, Hm, Tm_plus_Hm, b):
    # a lot of inplace operations to match the Julia code
    cr[1:order + 1] = mgc_stored[1:order + 1]

    agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5)
    agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5)
    agc_vq_d1, freq_vq_d1, energy_vq_d1 = time_attack_agc(vq_d1, fs, .5, 5)
    agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5)
    if alpha != 0:
    cr_res = _mgc_b2c(cr[:recursion_order + 1], cr[:order + 1], -alpha)
    cr[:recursion_order + 1] = cr_res[:]

    wavfile.write("fft_train_agc.wav", fs, soundsc(agc_d1))
    wavfile.write("fft_test_agc.wav", fs, soundsc(agc_d2))
    wavfile.write("fft_vq_train_agc.wav", fs, soundsc(agc_vq_d1, fs))
    wavfile.write("fft_vq_test_agc.wav", fs, soundsc(agc_vq_d2))
    y = sp.fftpack.fft(np.cast["float64"](cr))
    c = mgc_stored
    x = periodogram
    if gamma != 0.:
    gamma_inv = 1. / gamma
    else:
    gamma_inv = np.inf

    if gamma == -1.:
    pr[:] = copy.deepcopy(x)
    new_pr = copy.deepcopy(pr)
    elif gamma == 0.:
    pr[:] = copy.deepcopy(x) / np.exp(2 * np.real(y))
    new_pr = copy.deepcopy(pr)
    else:
    tr = 1. + gamma * np.real(y)
    ti = -gamma * np.imag(y)
    trr = tr * tr
    tii = ti * ti
    s = trr + tii
    t = x * np.power(s, (-gamma_inv))
    t /= s
    pr[:] = t
    rr[:] = tr * t
    ri[:] = ti * t
    t /= s
    qr[:] = (trr - tii) * t
    s = tr * ti * t
    qi[:] = (s + s)
    new_pr = copy.deepcopy(pr)

    def run_dct_vq_example():
    def _pre(list_of_data):
    # Temporal window setting is crucial! - 512 seems OK for music, 256
    # fruit perhaps due to samplerates
    n_dct = 512
    f_r = np.vstack([mdct_slow(dd, n_dct) for dd in list_of_data])
    return f_r, n_dct
    if gamma != -1.:
    """
    print()
    print(pr.sum())
    print(rr.sum())
    print(ri.sum())
    print(qr.sum())
    print(qi.sum())
    print()
    """
    pass

    def preprocess_train(list_of_data, random_state):
    f_r, n_dct = _pre(list_of_data)
    clusters = f_r
    return clusters
    y_fft[:] = copy.deepcopy(pr) + 0.j
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    pr[:] = copy.deepcopy(np.real(z_fft))
    if alpha != 0.:
    idx_1 = pr[:2 * order + 1]
    idx_2 = pr[:recursion_order + 1]
    idx_3 = _mgc_b2c(idx_1, idx_2, alpha)
    pr[:2 * order + 1] = idx_3[:]

    def apply_preprocess(list_of_data, clusters):
    f_r, n_dct = _pre(list_of_data)
    f_clust = f_r
    memberships, distances = vq(f_clust, clusters)
    vq_r = clusters[memberships]
    d_k = imdct_slow(vq_r, n_dct)
    return d_k
    if gamma == 0. or gamma == -1.:
    qr[:2 * order + 1] = pr[:2 * order + 1]
    rr[:order + 1] = copy.deepcopy(pr[:order + 1])
    else:
    for i in range(len(qr)):
    y_fft[i] = qr[i] + 1j * qi[i]
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    qr[:] = np.real(z_fft)

    random_state = np.random.RandomState(1999)
    for i in range(len(rr)):
    y_fft[i] = rr[i] + 1j * ri[i]
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    rr[:] = np.real(z_fft)

    # This doesn't work very well due to only taking a sample from the end as
    # test
    fs, d = fetch_sample_music()
    sub = int(.8 * d.shape[0])
    d1 = [d[:sub]]
    d2 = [d[sub:]]
    if alpha != 0.:
    qr_new = _mgc_b2c(qr[:recursion_order + 1], qr[:recursion_order + 1], alpha)
    qr[:recursion_order + 1] = qr_new[:]
    rr_new = _mgc_b2c(rr[:order + 1], rr[:recursion_order + 1], alpha)
    rr[:order + 1] = rr_new[:]

    """
    fs, d = fetch_sample_speech_fruit()
    d1 = d[::8] + d[1::8] + d[2::8] + d[3::8] + d[4::8] + d[5::8] + d[6::8]
    d2 = d[7::8]
    # make sure d1 and d2 aren't the same!
    assert [len(di) for di in d1] != [len(di) for di in d2]
    """
    if alpha != 0:
    _mgc_ptrans(pr, order, alpha)
    _mgc_qtrans(qr, order, alpha)

    clusters = preprocess_train(d1, random_state)
    # Training data
    vq_d1 = apply_preprocess(d1, clusters)
    vq_d2 = apply_preprocess(d2, clusters)
    assert [i != j for i, j in zip(vq_d2.ravel(), vq_d2.ravel())]
    eta = 0.
    if gamma != -1.:
    eta = _mgc_gain(rr, c, order, gamma)
    c[0] = np.sqrt(eta)

    fix_d1 = np.concatenate(d1)
    fix_d2 = np.concatenate(d2)
    if gamma == -1.:
    qr[:] = 0.
    elif gamma != 0.:
    for i in range(2, 2 * order + 1):
    qr[i] *= 1. + gamma

    wavfile.write("dct_train_no_agc.wav", fs, soundsc(fix_d1))
    wavfile.write("dct_test_no_agc.wav", fs, soundsc(fix_d2))
    wavfile.write("dct_vq_train_no_agc.wav", fs, soundsc(vq_d1))
    wavfile.write("dct_vq_test_no_agc.wav", fs, soundsc(vq_d2))
    te = pr[:order]
    _mgc_fill_toeplitz(Tm, te)
    he = qr[2: 2 * order + 1]
    _mgc_fill_hankel(Hm, he)

    """
    import matplotlib.pyplot as plt
    plt.specgram(vq_d2, cmap="gray")
    plt.figure()
    plt.specgram(fix_d2, cmap="gray")
    plt.show()
    """
    Tm_plus_Hm[:] = Hm[:] + Tm[:]
    b[:order] = rr[1:order + 1]
    res = np.linalg.solve(Tm_plus_Hm, b)
    b[:] = res[:]

    agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5)
    agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5)
    agc_vq_d1, freq_vq_d1, energy_vq_d1 = time_attack_agc(vq_d1, fs, .5, 5)
    agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5)
    c[1:order + 1] += res[:order]

    wavfile.write("dct_train_agc.wav", fs, soundsc(agc_d1))
    wavfile.write("dct_test_agc.wav", fs, soundsc(agc_d2))
    wavfile.write("dct_vq_train_agc.wav", fs, soundsc(agc_vq_d1))
    wavfile.write("dct_vq_test_agc.wav", fs, soundsc(agc_vq_d2))
    if gamma == -1.:
    eta = _mgc_gain(rr, c, order, gamma)
    c[0] = np.sqrt(eta)
    return np.log(eta), new_pr


    def run_phase_reconstruction_example():
    fs, d = fetch_sample_speech_tapestry()
    # actually gives however many components you say! So double what .m file
    # says
    fftsize = 512
    step = 64
    X_s = np.abs(stft(d, fftsize=fftsize, step=step, real=False,
    compute_onesided=False))
    X_t = iterate_invert_spectrogram(X_s, fftsize, step, verbose=True)
    def _mgc_mgcepnorm(b_gamma, alpha, gamma, otype):
    if otype != 0:
    raise ValueError("Not yet implemented for otype != 0")

    """
    import matplotlib.pyplot as plt
    plt.specgram(d, cmap="gray")
    plt.savefig("1.png")
    plt.close()
    plt.imshow(X_s, cmap="gray")
    plt.savefig("2.png")
    plt.close()
    """
    mgc = copy.deepcopy(b_gamma)
    _mgc_ignorm(mgc, gamma)
    _mgc_b2mc(mgc, alpha)
    return mgc

    wavfile.write("phase_original.wav", fs, soundsc(d))
    wavfile.write("phase_reconstruction.wav", fs, soundsc(X_t))

    def _sp2mgc(sp, order=20, alpha=0.35, gamma=-0.41, miniter=2, maxiter=30, criteria=0.001, otype=0, verbose=False):
    # Based on r9y9 Julia code
    # https://github.com/r9y9/MelGeneralizedCepstrums.jl
    periodogram = np.abs(sp) ** 2
    recursion_order = len(periodogram) - 1
    slen = len(periodogram)
    iter_number = 1

    def run_phase_vq_example():
    def _pre(list_of_data):
    # Temporal window setting is crucial! - 512 seems OK for music, 256
    # fruit perhaps due to samplerates
    n_fft = 256
    step = 32
    f_r = np.vstack([np.abs(stft(dd, n_fft, step=step, real=False,
    compute_onesided=False))
    for dd in list_of_data])
    return f_r, n_fft, step
    def _z():
    return np.zeros((slen,), dtype="float64")

    def preprocess_train(list_of_data, random_state):
    f_r, n_fft, step = _pre(list_of_data)
    clusters = copy.deepcopy(f_r)
    return clusters
    def _o():
    return np.zeros((order,), dtype="float64")

    def apply_preprocess(list_of_data, clusters):
    f_r, n_fft, step = _pre(list_of_data)
    f_clust = f_r
    # Nondeterministic ?
    memberships, distances = vq(f_clust, clusters)
    vq_r = clusters[memberships]
    d_k = iterate_invert_spectrogram(vq_r, n_fft, step, verbose=True)
    return d_k
    def _o2():
    return np.zeros((order, order), dtype="float64")

    random_state = np.random.RandomState(1999)
    cr = _z()
    pr = _z()
    rr = _z()
    ri = _z().astype("float128")
    qr = _z()
    qi = _z().astype("float128")
    Tm = _o2()
    Hm = _o2()
    Tm_plus_Hm = _o2()
    b = _o()
    y = _z() + 0j
    z = _z() + 0j
    b_gamma = np.zeros((order + 1,), dtype="float64")
    # return pr_new due to oddness with Julia having different numbers
    # in pr at end of function vs back in this scope
    eta0, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha, -1.,
    recursion_order, iter_number, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    """
    print(eta0)
    print(sum(b_gamma))
    print(sum(periodogram))
    print(order)
    print(alpha)
    print(recursion_order)
    print(sum(y))
    print(sum(cr))
    print(sum(z))
    print(sum(pr))
    print(sum(rr))
    print(sum(qi))
    print(Tm.sum())
    print(Hm.sum())
    print(sum(b))
    raise ValueError()
    """
    if gamma != -1.:
    d = np.zeros((order + 1,), dtype="float64")
    if alpha != 0.:
    _mgc_ignorm(b_gamma, -1.)
    _mgc_b2mc(b_gamma, alpha)
    d = copy.deepcopy(b_gamma)
    _mgc_gnorm(d, -1.)
    # numbers are slightly different here - numerical diffs?
    else:
    d = copy.deepcopy(b_gamma)
    b_gamma = _mgc_gc2gc(d, -1., order, gamma)

    fs, d = fetch_sample_speech_fruit()
    d1 = d[::9]
    d2 = d[7::8][:5]
    # make sure d1 and d2 aren't the same!
    assert [len(di) for di in d1] != [len(di) for di in d2]
    if alpha != 0.:
    _mgc_ignorm(b_gamma, gamma)
    _mgc_mc2b(b_gamma, alpha)
    _mgc_gnorm(b_gamma, gamma)

    clusters = preprocess_train(d1, random_state)
    fix_d1 = np.concatenate(d1)
    fix_d2 = np.concatenate(d2)
    vq_d2 = apply_preprocess(d2, clusters)
    if gamma != -1.:
    eta_t = eta0
    for i in range(1, maxiter + 1):
    eta, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha,
    gamma, recursion_order, i, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    """
    print(eta0)
    print(sum(b_gamma))
    print(sum(periodogram))
    print(order)
    print(alpha)
    print(recursion_order)
    print(sum(y))
    print(sum(cr))
    print(sum(z))
    print(sum(pr))
    print(sum(rr))
    print(sum(qi))
    print(Tm.sum())
    print(Hm.sum())
    print(sum(b))
    raise ValueError()
    """
    err = np.abs((eta_t - eta) / eta)
    if verbose:
    print("iter %i, criterion: %f" % (i, err))
    if i >= miniter:
    if err < criteria:
    if verbose:
    print("optimization complete at iter %i" % i)
    break
    eta_t = eta
    mgc_arr = _mgc_mgcepnorm(b_gamma, alpha, gamma, otype)
    return mgc_arr

    wavfile.write("phase_train_no_agc.wav", fs, soundsc(fix_d1))
    wavfile.write("phase_vq_test_no_agc.wav", fs, soundsc(vq_d2))

    agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5)
    agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5)
    agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5)
    _sp_convert_results = []

    """
    import matplotlib.pyplot as plt
    plt.specgram(agc_vq_d2, cmap="gray")
    #plt.title("Fake")
    plt.figure()
    plt.specgram(agc_d2, cmap="gray")
    #plt.title("Real")
    plt.show()
    """
def _sp_collect_result(result):
    # Pool.map_async callback: accumulate chunked worker results in the
    # module-level _sp_convert_results list; the caller flattens and sorts
    # them by frame index afterwards.
    _sp_convert_results.append(result)

    wavfile.write("phase_train_agc.wav", fs, soundsc(agc_d1))
    wavfile.write("phase_test_agc.wav", fs, soundsc(agc_d2))
    wavfile.write("phase_vq_test_agc.wav", fs, soundsc(agc_vq_d2))

    def _sp_convert(c_i, order, alpha, gamma, miniter, maxiter, criteria,
    otype, verbose):
    i = c_i[0]
    tot_i = c_i[1]
    sp_i = c_i[2]
    r_i = (i, _sp2mgc(sp_i, order=order, alpha=alpha, gamma=gamma,
    miniter=miniter, maxiter=maxiter, criteria=criteria,
    otype=otype, verbose=verbose))
    return r_i

    def run_cqt_example():
    try:
    fs, d = fetch_sample_file("/Users/User/cqt_resources/kempff1.wav")
    except ValueError:
    print("WARNING: Using sample music instead byt kempff1.wav is the example")
    fs, d = fetch_sample_music()
    X = d[:44100]
    X_cq, c_dc, c_nyq, multiscale, shift, window_lens = cqt(X, fs)
    X_r = icqt(X_cq, c_dc, c_nyq, multiscale, shift, window_lens)
    SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X))
    wavfile.write("cqt_original.wav", fs, soundsc(X))
    wavfile.write("cqt_reconstruction.wav", fs, soundsc(X_r))

    def sp2mgc(sp, order=20, alpha=0.35, gamma=-0.41, miniter=2,
    maxiter=30, criteria=0.001, otype=0, verbose=False):
    """
    Accepts 1D or 2D spectrum (complex or real valued).
    def run_fft_dct_example():
    random_state = np.random.RandomState(1999)
    If 2D, assumes time is axis 0.
    fs, d = fetch_sample_speech_fruit()
    n_fft = 64
    X = d[0]
    X_stft = stft(X, n_fft)
    X_rr = complex_to_real_view(X_stft)
    X_dct = fftpack.dct(X_rr, axis=-1, norm='ortho')
    X_dct_sub = X_dct[1:] - X_dct[:-1]
    std = X_dct_sub.std(axis=0, keepdims=True)
    X_dct_sub += .01 * std * random_state.randn(
    X_dct_sub.shape[0], X_dct_sub.shape[1])
    X_dct_unsub = np.cumsum(X_dct_sub, axis=0)
    X_idct = fftpack.idct(X_dct_unsub, axis=-1, norm='ortho')
    X_irr = real_to_complex_view(X_idct)
    X_r = istft(X_irr, n_fft)[:len(X)]
    Returns mel generalized cepstral coefficients.
    SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X))
    print(SNR)
    Based on r9y9 Julia code
    https://github.com/r9y9/MelGeneralizedCepstrums.jl
    """
    if len(sp.shape) == 1:
    return _sp2mgc(sp, order=order, alpha=alpha, gamma=gamma,
    miniter=miniter, maxiter=maxiter, criteria=criteria,
    otype=otype, verbose=verbose)
    else:
    # Slooow, use multiprocessing to speed up a bit
    # http://blog.shenwei.me/python-multiprocessing-pool-difference-between-map-apply-map_async-apply_async/
    # http://stackoverflow.com/questions/5666576/show-the-progress-of-a-python-multiprocessing-pool-map-call
    c = [(i + 1, sp.shape[0], sp[i]) for i in range(sp.shape[0])]
    p = Pool()
    start = time.time()
    if verbose:
    print("Starting conversion of %i frames" % sp.shape[0])
    print("This may take some time...")

    wavfile.write("fftdct_orig.wav", fs, soundsc(X))
    wavfile.write("fftdct_rec.wav", fs, soundsc(X_r))
    # takes ~360s for 630 frames, 1 process
    itr = p.map_async(functools.partial(_sp_convert, order=order, alpha=alpha, gamma=gamma, miniter=miniter, maxiter=maxiter, criteria=criteria, otype=otype, verbose=False), c, callback=_sp_collect_result)

    sz = len(c) // itr._chunksize
    if (sz * itr._chunksize) != len(c):
    sz += 1

def run_world_example():
    """End-to-end WORLD vocoder demo: analyze speech, then resynthesize it.

    Pipeline: harvest (F0 / voicing tracking) -> cheaptrick (spectral
    envelope) -> d4c (aperiodicity) -> world_synthesis; the resynthesized
    waveform is written to out.wav.
    """
    fs, d = fetch_sample_speech_tapestry()
    # normalize 16-bit PCM samples to [-1, 1)
    d = d.astype("float32") / 2 ** 15
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
            temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs,
            temporal_positions_h, f0_h, vuv_h)
    y = world_synthesis(f0_d4c, vuv_d4c, aper_d4c, spectrogram_ct, fs_ct)
    wavfile.write("out.wav", fs, soundsc(y))
    while True:
    remaining = itr._number_left
    if verbose:
    print("%i chunks of %i complete" % (sz - remaining, sz))
    if itr.ready():
    break
    time.sleep(10)

    def _mgc_b2c(wc, c, alpha):
    wc_o = np.zeros_like(wc)
    desired_order = len(wc) - 1
    for i in range(0, len(c))[::-1]:
    prev = copy.copy(wc_o)
    wc_o[0] = c[i]
    if desired_order >= 1:
    wc_o[1] = (1. - alpha ** 2) * prev[0] + alpha * prev[1]
    for m in range(2, desired_order + 1):
    wc_o[m] = prev[m - 1] + alpha * (prev[m] - wc_o[m - 1])
    return wc_o
    """
    # takes ~455s for 630 frames
    itr = p.imap_unordered(functools.partial(_sp_convert, order=order, alpha=alpha, gamma=gamma, miniter=miniter, maxiter=maxiter, criteria=criteria, otype=otype, verbose=False), c)
    res = []
    # print ~every 5%
    mod = int(len(c)) // 20
    if mod < 1:
    mod = 1
    for i, res_i in enumerate(itr, 1):
    res.append(res_i)
    if i % mod == 0 or i == 1:
    print("%i of %i complete" % (i, len(c)))
    """
    p.close()
    p.join()
    stop = time.time()
    if verbose:
    print("Processed %i frames in %s seconds" % (sp.shape[0], stop - start))
    # map_async result comes in chunks
    flat = [a_i for a in _sp_convert_results for a_i in a]
    final = [o[1] for o in sorted(flat, key=lambda x: x[0])]
    for i in range(len(_sp_convert_results)):
    _sp_convert_results.pop()
    return np.array(final)


    def _mgc_ptrans(p, m, alpha):
    d = 0.
    o = 0.
    def win2mgc(windowed_signal, order=20, alpha=0.35, gamma=-0.41, miniter=2,
    maxiter=30, criteria=0.001, otype=0, verbose=False):
    """
    Accepts 1D or 2D array of windowed signal frames.
    d = p[m]
    for i in range(1, m)[::-1]:
    o = p[i] + alpha * d
    d = p[i]
    p[i] = o
    If 2D, assumes time is axis 0.
    o = alpha * d
    p[0] = (1. - alpha ** 2) * p[0] + 2 * o
    Returns mel generalized cepstral coefficients.
    def _mgc_qtrans(q, m, alpha):
    d = q[1]
    for i in range(2, 2 * m + 1):
    o = q[i] + alpha * d
    d = q[i]
    q[i] = o
    Based on r9y9 Julia code
    https://github.com/r9y9/MelGeneralizedCepstrums.jl
    """
    if len(windowed_signal.shape) == 1:
    sp = np.fft.fft(windowed_signal)
    return _sp2mgc(sp, order=order, alpha=alpha, gamma=gamma,
    miniter=miniter, maxiter=maxiter, criteria=criteria,
    otype=otype, verbose=verbose)
    else:
    raise ValueError("2D input not yet complete for win2mgc")


    def _mgc_gain(er, c, m, g):
    t = 0.
    if g != 0:
    for i in range(1, m + 1):
    t += er[i] * c[i]
    return er[0] + g * t
    def _mgc_freqt(wc, c, alpha):
    prev = np.zeros_like(wc)
    dst_order = len(wc) - 1
    wc *= 0
    m1 = len(c) - 1
    for i in range(-m1, 1, 1):
    prev[:] = wc
    if dst_order >= 0:
    wc[0] = c[-i] + alpha * prev[0]
    if dst_order >= 1:
    wc[1] = (1. - alpha * alpha) * prev[0] + alpha * prev[1]
    for m in range(2, dst_order + 1):
    wc[m] = prev[m - 1] + alpha * (prev[m] - wc[m - 1])


    def _mgc_mgc2mgc(src_ceps, src_alpha, src_gamma, dst_order, dst_alpha, dst_gamma):
    dst_ceps = np.zeros((dst_order + 1,))
    alpha = (dst_alpha - src_alpha) / (1. - dst_alpha * src_alpha)
    if alpha == 0.:
    new_dst_ceps = copy.deepcopy(src_ceps)
    _mgc_gnorm(new_dst_ceps, src_gamma)
    dst_ceps = _mgc_gc2gc(new_dst_ceps, src_gamma, dst_order, dst_gamma)
    _mgc_ignorm(dst_ceps, dst_gamma)
    else:
    return er[0]
    _mgc_freqt(dst_ceps, src_ceps, alpha)
    _mgc_gnorm(dst_ceps, src_gamma)
    new_dst_ceps = copy.deepcopy(dst_ceps)
    dst_ceps = _mgc_gc2gc(new_dst_ceps, src_gamma, dst_order, dst_gamma)
    _mgc_ignorm(dst_ceps, dst_gamma)
    return dst_ceps


    def _mgc_fill_toeplitz(A, t):
    n = len(t)
    for i in range(n):
    for j in range(n):
    A[i, j] = t[i - j] if i - j >= 0 else t[j - i]
    _mgc_convert_results = []

def _mgc_collect_result(result):
    # Pool.map_async callback: stash chunked worker results in the
    # module-level _mgc_convert_results list for later flattening/sorting.
    _mgc_convert_results.append(result)

    def _mgc_fill_hankel(A, t):
    n = len(t) // 2 + 1
    for i in range(n):
    for j in range(n):
    A[i, j] = t[i + j]

    def _mgc_convert(c_i, alpha, gamma, fftlen):
    i = c_i[0]
    tot_i = c_i[1]
    mgc_i = c_i[2]
    r_i = (i, _mgc_mgc2mgc(mgc_i, src_alpha=alpha, src_gamma=gamma,
    dst_order=fftlen // 2, dst_alpha=0., dst_gamma=0.))
    return r_i

    def _mgc_ignorm(c, gamma):
    if gamma == 0.:
    c[0] = np.log(c[0])
    return c
    gain = c[0] ** gamma
    c[1:] *= gain
    c[0] = (gain - 1.) / gamma

    def mgc2sp(mgc_arr, alpha=0.35, gamma=-0.41, fftlen="auto", fs=None,
    mode="world_pad", verbose=False):
    """
    Accepts 1D or 2D array of mgc
    def _mgc_gnorm(c, gamma):
    if gamma == 0.:
    c[0] = np.exp(c[0])
    return c
    gain = 1. + gamma * c[0]
    c[1:] /= gain
    c[0] = gain ** (1. / gamma)
    If 2D, assume time is on axis 0
    Returns reconstructed smooth spectrum
    def _mgc_b2mc(mc, alpha):
    m = len(mc)
    o = 0.
    d = mc[m - 1]
    for i in range(m - 1)[::-1]:
    o = mc[i] + alpha * d
    d = mc[i]
    mc[i] = o
    Based on r9y9 Julia code
    https://github.com/r9y9/MelGeneralizedCepstrums.jl
    """
    if mode != "world_pad":
    raise ValueError("Only currently supported mode is world_pad")

    if fftlen == "auto":
    if fs == None:
    raise ValueError("fs must be provided for fftlen 'auto'")
    f0_low_limit = 71
    fftlen = int(2 ** np.ceil(np.log2(3. * float(fs) / f0_low_limit + 1)))
    if verbose:
    print("setting fftlen to %i" % fftlen)

    if len(mgc_arr.shape) == 1:
    c = _mgc_mgc2mgc(mgc_arr, alpha, gamma, fftlen // 2, 0., 0.)
    buf = np.zeros((fftlen,), dtype=c.dtype)
    buf[:len(c)] = c[:]
    return np.fft.rfft(buf)
    else:
    # Slooow, use multiprocessing to speed up a bit
    # http://blog.shenwei.me/python-multiprocessing-pool-difference-between-map-apply-map_async-apply_async/
    # http://stackoverflow.com/questions/5666576/show-the-progress-of-a-python-multiprocessing-pool-map-call
    c = [(i + 1, mgc_arr.shape[0], mgc_arr[i]) for i in range(mgc_arr.shape[0])]
    p = Pool()
    start = time.time()
    if verbose:
    print("Starting conversion of %i frames" % mgc_arr.shape[0])
    print("This may take some time...")
    #itr = p.map(functools.partial(_mgc_convert, alpha=alpha, gamma=gamma, fftlen=fftlen), c)
    #raise ValueError()

    def _mgc_mc2b(mc, alpha):
    itr = range(len(mc) - 1)[::-1]
    for i in itr:
    mc[i] = mc[i] - alpha * mc[i + 1]
    # 500.1 s for 630 frames1 process
    itr = p.map_async(functools.partial(_mgc_convert, alpha=alpha, gamma=gamma, fftlen=fftlen), c, callback=_mgc_collect_result)

    sz = len(c) // itr._chunksize
    if (sz * itr._chunksize) != len(c):
    sz += 1

    def _mgc_gc2gc(src_ceps, src_gamma=0., dst_order=None, dst_gamma=0.):
    if dst_order == None:
    dst_order = len(src_ceps) - 1
    while True:
    remaining = itr._number_left
    if verbose:
    print("%i chunks of %i complete" % (sz - remaining, sz))
    if itr.ready():
    break
    time.sleep(10)
    p.close()
    p.join()
    stop = time.time()
    if verbose:
    print("Processed %i frames in %s seconds" % (mgc_arr.shape[0], stop - start))
    # map_async result comes in chunks
    flat = [a_i for a in _mgc_convert_results for a_i in a]
    final = [o[1] for o in sorted(flat, key=lambda x: x[0])]
    for i in range(len(_mgc_convert_results)):
    _mgc_convert_results.pop()
    # resample
    c = np.array(final)
    #buf = np.zeros((len(c), fftlen), dtype=c.dtype)
    #buf[:len(c), :c.shape[1]] = c[:]
    fc = np.fft.rfft(c, axis=-1)
    buf = np.zeros((len(c), fftlen // 2 + 1), dtype=fc.dtype)
    buf += fc.min()
    buf[:len(c), :fc.shape[1]] = fc[:]
    return np.exp(buf.real)

    dst_ceps = np.zeros((dst_order + 1,), dtype=src_ceps.dtype)
    dst_order = len(dst_ceps) - 1
    m1 = len(src_ceps) - 1
    dst_ceps[0] = copy.deepcopy(src_ceps[0])

    for m in range(2, dst_order + 2):
    ss1 = 0.
    ss2 = 0.
    min_1 = m1 if (m1 < m - 1) else m - 2
    itr = range(2, min_1 + 2)
    if len(itr) < 1:
    if min_1 + 1 == 2:
    itr = [2]
    else:
    itr = []
    for k in itr:
    assert k >= 1
    assert (m - k) >= 0
    cc = src_ceps[k - 1] * dst_ceps[m - k]
    ss2 += (k - 1) * cc
    ss1 += (m - k) * cc
    def implot(arr, scale=None, title="", cmap="gray"):
    import matplotlib.pyplot as plt
    if scale is "specgram":
    # plotting part
    mag = 20. * np.log10(np.abs(arr))
    # Transpose so time is X axis, and invert y axis so
    # frequency is low at bottom
    mag = mag.T[::-1, :]
    else:
    mag = arr
    f, ax = plt.subplots()
    ax.matshow(mag, cmap=cmap)
    plt.axis("off")
    x1 = mag.shape[0]
    y1 = mag.shape[1]

    if m <= m1 + 1:
    dst_ceps[m - 1] = src_ceps[m - 1] + (dst_gamma * ss2 - src_gamma * ss1)/(m - 1.)
    def autoaspect(x_range, y_range):
    """
    The aspect to make a plot square with ax.set_aspect in Matplotlib
    """
    mx = max(x_range, y_range)
    mn = min(x_range, y_range)
    if x_range <= y_range:
    return mx / float(mn)
    else:
    dst_ceps[m - 1] = (dst_gamma * ss2 - src_gamma * ss1) / (m - 1.)
    return dst_ceps
    return mn / float(mx)
    asp = autoaspect(x1, y1)
    ax.set_aspect(asp)
    plt.title(title)


    def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    recursion_order, iter_number, y_fft, z_fft, cr, pr, rr, ri,
    qr, qi, Tm, Hm, Tm_plus_Hm, b):
    # a lot of inplace operations to match the Julia code
    cr[1:order + 1] = mgc_stored[1:order + 1]
def test_lpc_to_lsf():
    """LPC <-> LSF round trip matches MATLAB reference vectors (2D and 1D)."""
    # Matlab style vectors for testing
    # lsf = [0.7842 1.5605 1.8776 1.8984 2.3593]
    # a = [1.0000 0.6149 0.9899 0.0000 0.0031 -0.0082];
    lsf = [[0.7842, 1.5605, 1.8776, 1.8984, 2.3593],
           [0.7842, 1.5605, 1.8776, 1.8984, 2.3593]]
    a = [[1.0000, 0.6149, 0.9899, 0.0000, 0.0031, -0.0082],
         [1.0000, 0.6149, 0.9899, 0.0000, 0.0031, -0.0082]]
    a = np.array(a)
    lsf = np.array(lsf)
    # batch (2D) path: two identical frames
    lsf_r = lpc_to_lsf(a)
    assert_almost_equal(lsf, lsf_r, decimal=4)
    a_r = lsf_to_lpc(lsf)
    assert_almost_equal(a, a_r, decimal=4)
    # single-frame (1D) path
    lsf_r = lpc_to_lsf(a[0])
    assert_almost_equal(lsf[0], lsf_r, decimal=4)
    a_r = lsf_to_lpc(lsf[0])
    assert_almost_equal(a[0], a_r, decimal=4)

    if alpha != 0:
    cr_res = _mgc_b2c(cr[:recursion_order + 1], cr[:order + 1], -alpha)
    cr[:recursion_order + 1] = cr_res[:]

    y = sp.fftpack.fft(np.cast["float64"](cr))
    c = mgc_stored
    x = periodogram
    if gamma != 0.:
    gamma_inv = 1. / gamma
    else:
    gamma_inv = np.inf
def test_lpc_analysis_truncate():
    """lpc_analysis with truncate=True runs and drops the partial window."""
    # Test that truncate doesn't crash and actually truncates
    # 85 samples with an 80-sample step/window leaves exactly one frame
    [a, g, e] = lpc_analysis(np.random.randn(85), order=8, window_step=80,
                             window_size=80, emphasis=0.9, truncate=True)
    assert(a.shape[0] == 1)

    if gamma == -1.:
    pr[:] = copy.deepcopy(x)
    new_pr = copy.deepcopy(pr)
    elif gamma == 0.:
    pr[:] = copy.deepcopy(x) / np.exp(2 * np.real(y))
    new_pr = copy.deepcopy(pr)
    else:
    tr = 1. + gamma * np.real(y)
    ti = -gamma * np.imag(y)
    trr = tr * tr
    tii = ti * ti
    s = trr + tii
    t = x * np.power(s, (-gamma_inv))
    t /= s
    pr[:] = t
    rr[:] = tr * t
    ri[:] = ti * t
    t /= s
    qr[:] = (trr - tii) * t
    s = tr * ti * t
    qi[:] = (s + s)
    new_pr = copy.deepcopy(pr)

    if gamma != -1.:
    """
    print()
    print(pr.sum())
    print(rr.sum())
    print(ri.sum())
    print(qr.sum())
    print(qi.sum())
    print()
    """
    pass
def test_feature_build():
    """Build a voicing-gated feature matrix from LPC + compressed residual."""
    samplerate, X = fetch_sample_music()
    # MATLAB wavread does normalization
    X = X.astype('float32') / (2 ** 15)
    wsz = 256
    wst = 128
    a, g, e = lpc_analysis(X, order=8, window_step=wst,
                           window_size=wsz, emphasis=0.9,
                           copy=True)
    v, p = voiced_unvoiced(X, window_size=wsz,
                           window_step=wst)
    c = compress(e, n_components=64)
    # First component of a is always 1
    combined = np.hstack((a[:, 1:], g, c[:a.shape[0]]))
    # each feature row has two halves; the frame's features are written into
    # one half selected by v (presumably a 0/1 voicing flag per frame —
    # verify against voiced_unvoiced)
    features = np.zeros((a.shape[0], 2 * combined.shape[1]))
    start_indices = v * combined.shape[1]
    start_indices = start_indices.astype('int32')
    end_indices = (v + 1) * combined.shape[1]
    end_indices = end_indices.astype('int32')
    for i in range(features.shape[0]):
        features[i, start_indices[i]:end_indices[i]] = combined[i]

    y_fft[:] = copy.deepcopy(pr) + 0.j
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    pr[:] = copy.deepcopy(np.real(z_fft))
    if alpha != 0.:
    idx_1 = pr[:2 * order + 1]
    idx_2 = pr[:recursion_order + 1]
    idx_3 = _mgc_b2c(idx_1, idx_2, alpha)
    pr[:2 * order + 1] = idx_3[:]

    if gamma == 0. or gamma == -1.:
    qr[:2 * order + 1] = pr[:2 * order + 1]
    rr[:order + 1] = copy.deepcopy(pr[:order + 1])
    else:
    for i in range(len(qr)):
    y_fft[i] = qr[i] + 1j * qi[i]
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    qr[:] = np.real(z_fft)
def test_mdct_and_inverse():
    """mdct_slow followed by imdct_slow reconstructs the signal closely."""
    fs, X = fetch_sample_music()
    X_dct = mdct_slow(X)
    X_r = imdct_slow(X_dct)
    # bound both the worst-case and the mean reconstruction error
    assert np.all(np.abs(X_r[:len(X)] - X) < 1E-3)
    assert np.abs(X_r[:len(X)] - X).mean() < 1E-6

    for i in range(len(rr)):
    y_fft[i] = rr[i] + 1j * ri[i]
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    rr[:] = np.real(z_fft)

    if alpha != 0.:
    qr_new = _mgc_b2c(qr[:recursion_order + 1], qr[:recursion_order + 1], alpha)
    qr[:recursion_order + 1] = qr_new[:]
    rr_new = _mgc_b2c(rr[:order + 1], rr[:recursion_order + 1], alpha)
    rr[:order + 1] = rr_new[:]
    def test_all():
    test_lpc_analysis_truncate()
    test_feature_build()
    test_lpc_to_lsf()
    test_mdct_and_inverse()

    if alpha != 0:
    _mgc_ptrans(pr, order, alpha)
    _mgc_qtrans(qr, order, alpha)

    eta = 0.
    if gamma != -1.:
    eta = _mgc_gain(rr, c, order, gamma)
    c[0] = np.sqrt(eta)
    def run_lpc_example():
    # ae.wav is from
    # http://www.linguistics.ucla.edu/people/hayes/103/Charts/VChart/ae.wav
    # Partially following the formant tutorial here
    # http://www.mathworks.com/help/signal/ug/formant-estimation-with-lpc-coefficients.html

    if gamma == -1.:
    qr[:] = 0.
    elif gamma != 0.:
    for i in range(2, 2 * order + 1):
    qr[i] *= 1. + gamma
    samplerate, X = fetch_sample_music()

    c = overlap_compress(X, 200, 400)
    X_r = overlap_uncompress(c, 400)
    wavfile.write('lpc_uncompress.wav', samplerate, soundsc(X_r))

    print("Calculating sinusoids")
    f_hz, m = sinusoid_analysis(X, input_sample_rate=16000)
    Xs_sine = sinusoid_synthesis(f_hz, m)
    orig_fname = 'lpc_orig.wav'
    sine_fname = 'lpc_sine_synth.wav'
    wavfile.write(orig_fname, samplerate, soundsc(X))
    wavfile.write(sine_fname, samplerate, soundsc(Xs_sine))

    lpc_order_list = [8, ]
    dct_components_list = [200, ]
    window_size_list = [400, ]
    # Seems like a dct component size of ~2/3rds the step
    # (1/3rd the window for 50% overlap) works well.
    for lpc_order in lpc_order_list:
    for dct_components in dct_components_list:
    for window_size in window_size_list:
    # 50% overlap
    window_step = window_size // 2
    a, g, e = lpc_analysis(X, order=lpc_order,
    window_step=window_step,
    window_size=window_size, emphasis=0.9,
    copy=True)
    print("Calculating LSF")
    lsf = lpc_to_lsf(a)
    # Not window_size - window_step! Need to implement overlap
    print("Calculating compression")
    c = compress(e, n_components=dct_components,
    window_size=window_step)
    co = overlap_compress(e, n_components=dct_components,
    window_size=window_step)
    block_excitation = uncompress(c, window_size=window_step)
    overlap_excitation = overlap_uncompress(co,
    window_size=window_step)
    a_r = lsf_to_lpc(lsf)
    f, m = lpc_to_frequency(a_r, g)
    block_lpc = lpc_synthesis(a_r, g, block_excitation,
    emphasis=0.9,
    window_step=window_step)
    overlap_lpc = lpc_synthesis(a_r, g, overlap_excitation,
    emphasis=0.9,
    window_step=window_step)
    v, p = voiced_unvoiced(X, window_size=window_size,
    window_step=window_step)
    noisy_lpc = lpc_synthesis(a_r, g, voiced_frames=v,
    emphasis=0.9,
    window_step=window_step)
    if dct_components is None:
    dct_components = window_size
    noisy_fname = 'lpc_noisy_synth_%iwin_%ilpc_%idct.wav' % (
    window_size, lpc_order, dct_components)
    block_fname = 'lpc_block_synth_%iwin_%ilpc_%idct.wav' % (
    window_size, lpc_order, dct_components)
    overlap_fname = 'lpc_overlap_synth_%iwin_%ilpc_%idct.wav' % (
    window_size, lpc_order, dct_components)
    wavfile.write(noisy_fname, samplerate, soundsc(noisy_lpc))
    wavfile.write(block_fname, samplerate,
    soundsc(block_lpc))
    wavfile.write(overlap_fname, samplerate,
    soundsc(overlap_lpc))


def run_fft_vq_example():
    """Demo: vector-quantize STFT frames of speech and resynthesize.

    Splits the "fruit" speech set into train (7/8 of utterances) and test
    (1/8), builds a codebook directly from training frames (magnitude plus
    sin/cos of phase, stacked over `time_smoothing` consecutive frames),
    quantizes both sets against it, inverts back to audio, and writes wav
    files with and without automatic gain control.
    """
    n_fft = 512
    time_smoothing = 4

    def _pre(list_of_data):
        # STFT each utterance and stack all frames; trim so the frame count
        # divides evenly by time_smoothing
        f_c = np.vstack([stft(dd, n_fft) for dd in list_of_data])
        if len(f_c) % time_smoothing != 0:
            newlen = len(f_c) - len(f_c) % time_smoothing
            f_c = f_c[:newlen]
        # represent complex frames as magnitude + sin/cos of phase
        f_mag = complex_to_abs(f_c)
        f_phs = complex_to_angle(f_c)
        f_sincos = angle_to_sin_cos(f_phs)
        f_r = np.hstack((f_mag, f_sincos))
        # group time_smoothing consecutive frames into one codebook vector
        f_r = f_r.reshape((len(f_r) // time_smoothing,
                           time_smoothing * f_r.shape[1]))
        return f_r, n_fft

    def preprocess_train(list_of_data, random_state):
        # the codebook is simply every (smoothed) training frame
        f_r, n_fft = _pre(list_of_data)
        clusters = f_r
        return clusters

    def apply_preprocess(list_of_data, clusters):
        # quantize each frame to its nearest codebook entry, then invert
        f_r, n_fft = _pre(list_of_data)
        memberships, distances = vq(f_r, clusters)
        vq_r = clusters[memberships]
        # undo the time_smoothing grouping
        vq_r = vq_r.reshape((time_smoothing * len(vq_r),
                             vq_r.shape[1] // time_smoothing))
        # split back into the magnitude and (sin, cos) phase halves
        f_mag = vq_r[:, :n_fft // 2 + 1]
        f_sincos = vq_r[:, n_fft // 2 + 1:]
        extent = f_sincos.shape[1] // 2
        f_phs = sin_cos_to_angle(f_sincos[:, :extent], f_sincos[:, extent:])
        vq_c = abs_and_angle_to_complex(f_mag, f_phs)
        d_k = istft(vq_c, fftsize=n_fft)
        return d_k

    random_state = np.random.RandomState(1999)

    """
    fs, d = fetch_sample_music()
    sub = int(.8 * d.shape[0])
    d1 = [d[:sub]]
    d2 = [d[sub:]]
    """

    fs, d = fetch_sample_speech_fruit()
    # 7 of every 8 utterances for training, the rest held out for test
    d1 = d[::8] + d[1::8] + d[2::8] + d[3::8] + d[4::8] + d[5::8] + d[6::8]
    d2 = d[7::8]
    # make sure d1 and d2 aren't the same!
    assert [len(di) for di in d1] != [len(di) for di in d2]

    clusters = preprocess_train(d1, random_state)
    # Training data
    vq_d1 = apply_preprocess(d1, clusters)
    vq_d2 = apply_preprocess(d2, clusters)
    assert [i != j for i, j in zip(vq_d1.ravel(), vq_d2.ravel())]

    fix_d1 = np.concatenate(d1)
    fix_d2 = np.concatenate(d2)

    wavfile.write("fft_train_no_agc.wav", fs, soundsc(fix_d1))
    wavfile.write("fft_test_no_agc.wav", fs, soundsc(fix_d2))
    wavfile.write("fft_vq_train_no_agc.wav", fs, soundsc(vq_d1, fs))
    wavfile.write("fft_vq_test_no_agc.wav", fs, soundsc(vq_d2, fs))

    # AGC'd versions: time_attack_agc returns (signal, freq, energy) tracks;
    # only the gain-controlled signal is written out
    agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5)
    agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5)
    agc_vq_d1, freq_vq_d1, energy_vq_d1 = time_attack_agc(vq_d1, fs, .5, 5)
    agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5)

    wavfile.write("fft_train_agc.wav", fs, soundsc(agc_d1))
    wavfile.write("fft_test_agc.wav", fs, soundsc(agc_d2))
    wavfile.write("fft_vq_train_agc.wav", fs, soundsc(agc_vq_d1, fs))
    wavfile.write("fft_vq_test_agc.wav", fs, soundsc(agc_vq_d2))


    def run_dct_vq_example():
        """Round-trip vector-quantization demo in the MDCT domain.

        Frames audio with a slow MDCT, uses the training frames themselves as
        the codebook, snaps held-out frames to their nearest codebook entries,
        inverts the MDCT, and writes raw and AGC-normalized wav files so the
        reconstructions can be compared by ear.

        Relies on module-level helpers defined elsewhere in this file
        (mdct_slow, imdct_slow, vq, fetch_sample_music, time_attack_agc,
        soundsc) plus scipy's wavfile.
        """
        def _pre(list_of_data):
            # Temporal window setting is crucial! - 512 seems OK for music, 256
            # fruit perhaps due to samplerates
            n_dct = 512
            f_r = np.vstack([mdct_slow(dd, n_dct) for dd in list_of_data])
            return f_r, n_dct

        def preprocess_train(list_of_data, random_state):
            # "Training" here just keeps every training frame as a codeword
            f_r, n_dct = _pre(list_of_data)
            clusters = f_r
            return clusters

        def apply_preprocess(list_of_data, clusters):
            # Quantize each frame to its nearest codeword, then invert the MDCT
            f_r, n_dct = _pre(list_of_data)
            f_clust = f_r
            memberships, distances = vq(f_clust, clusters)
            vq_r = clusters[memberships]
            d_k = imdct_slow(vq_r, n_dct)
            return d_k

        random_state = np.random.RandomState(1999)

        # This doesn't work very well due to only taking a sample from the end as
        # test
        fs, d = fetch_sample_music()
        sub = int(.8 * d.shape[0])
        d1 = [d[:sub]]
        d2 = [d[sub:]]

        """
        fs, d = fetch_sample_speech_fruit()
        d1 = d[::8] + d[1::8] + d[2::8] + d[3::8] + d[4::8] + d[5::8] + d[6::8]
        d2 = d[7::8]
        # make sure d1 and d2 aren't the same!
        assert [len(di) for di in d1] != [len(di) for di in d2]
        """

        clusters = preprocess_train(d1, random_state)
        # Training data
        vq_d1 = apply_preprocess(d1, clusters)
        vq_d2 = apply_preprocess(d2, clusters)
        # BUG FIX: the original asserted on zip(vq_d2, vq_d2) — comparing the
        # test reconstruction with itself — and asserted on a list comprehension,
        # which is truthy for any non-empty input. Check train vs test instead.
        assert any(i != j for i, j in zip(vq_d1.ravel(), vq_d2.ravel()))

        fix_d1 = np.concatenate(d1)
        fix_d2 = np.concatenate(d2)

        wavfile.write("dct_train_no_agc.wav", fs, soundsc(fix_d1))
        wavfile.write("dct_test_no_agc.wav", fs, soundsc(fix_d2))
        wavfile.write("dct_vq_train_no_agc.wav", fs, soundsc(vq_d1))
        wavfile.write("dct_vq_test_no_agc.wav", fs, soundsc(vq_d2))

        """
        import matplotlib.pyplot as plt
        plt.specgram(vq_d2, cmap="gray")
        plt.figure()
        plt.specgram(fix_d2, cmap="gray")
        plt.show()
        """

        # Automatic gain control for more comparable listening levels
        agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5)
        agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5)
        agc_vq_d1, freq_vq_d1, energy_vq_d1 = time_attack_agc(vq_d1, fs, .5, 5)
        agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5)

        wavfile.write("dct_train_agc.wav", fs, soundsc(agc_d1))
        wavfile.write("dct_test_agc.wav", fs, soundsc(agc_d2))
        wavfile.write("dct_vq_train_agc.wav", fs, soundsc(agc_vq_d1))
        wavfile.write("dct_vq_test_agc.wav", fs, soundsc(agc_vq_d2))

    te = pr[:order]
    _mgc_fill_toeplitz(Tm, te)
    he = qr[2: 2 * order + 1]
    _mgc_fill_hankel(Hm, he)

    Tm_plus_Hm[:] = Hm[:] + Tm[:]
    b[:order] = rr[1:order + 1]
    res = np.linalg.solve(Tm_plus_Hm, b)
    b[:] = res[:]
    def run_phase_reconstruction_example():
    fs, d = fetch_sample_speech_tapestry()
    # actually gives however many components you say! So double what .m file
    # says
    fftsize = 512
    step = 64
    X_s = np.abs(stft(d, fftsize=fftsize, step=step, real=False,
    compute_onesided=False))
    X_t = iterate_invert_spectrogram(X_s, fftsize, step, verbose=True)

    c[1:order + 1] += res[:order]
    """
    import matplotlib.pyplot as plt
    plt.specgram(d, cmap="gray")
    plt.savefig("1.png")
    plt.close()
    plt.imshow(X_s, cmap="gray")
    plt.savefig("2.png")
    plt.close()
    """

    if gamma == -1.:
    eta = _mgc_gain(rr, c, order, gamma)
    c[0] = np.sqrt(eta)
    return np.log(eta), new_pr
    wavfile.write("phase_original.wav", fs, soundsc(d))
    wavfile.write("phase_reconstruction.wav", fs, soundsc(X_t))


    def _mgc_mgcepnorm(b_gamma, alpha, gamma, otype):
    if otype != 0:
    raise ValueError("Not yet implemented for otype != 0")
    def run_phase_vq_example():
    def _pre(list_of_data):
    # Temporal window setting is crucial! - 512 seems OK for music, 256
    # fruit perhaps due to samplerates
    n_fft = 256
    step = 32
    f_r = np.vstack([np.abs(stft(dd, n_fft, step=step, real=False,
    compute_onesided=False))
    for dd in list_of_data])
    return f_r, n_fft, step

    mgc = copy.deepcopy(b_gamma)
    _mgc_ignorm(mgc, gamma)
    _mgc_b2mc(mgc, alpha)
    return mgc
    def preprocess_train(list_of_data, random_state):
    f_r, n_fft, step = _pre(list_of_data)
    clusters = copy.deepcopy(f_r)
    return clusters

    def apply_preprocess(list_of_data, clusters):
    f_r, n_fft, step = _pre(list_of_data)
    f_clust = f_r
    # Nondeterministic ?
    memberships, distances = vq(f_clust, clusters)
    vq_r = clusters[memberships]
    d_k = iterate_invert_spectrogram(vq_r, n_fft, step, verbose=True)
    return d_k

    def mgc(windowed_signal, order=25, alpha=0.35, gamma=0.0):
    # Based on r9y9 Julia code
    # https://github.com/r9y9/MelGeneralizedCepstrums.jl
    miniter = 2
    maxiter = 30
    criteria = 0.001
    otype = 0
    periodogram = np.abs(np.fft.fft(windowed_signal)) ** 2
    recursion_order = len(windowed_signal) - 1
    iter_number = 1
    random_state = np.random.RandomState(1999)

    def _z():
    return np.zeros((len(windowed_signal),), dtype="float64")
    fs, d = fetch_sample_speech_fruit()
    d1 = d[::9]
    d2 = d[7::8][:5]
    # make sure d1 and d2 aren't the same!
    assert [len(di) for di in d1] != [len(di) for di in d2]

    def _o():
    return np.zeros((order,), dtype="float64")
    clusters = preprocess_train(d1, random_state)
    fix_d1 = np.concatenate(d1)
    fix_d2 = np.concatenate(d2)
    vq_d2 = apply_preprocess(d2, clusters)

    def _o2():
    return np.zeros((order, order), dtype="float64")
    wavfile.write("phase_train_no_agc.wav", fs, soundsc(fix_d1))
    wavfile.write("phase_vq_test_no_agc.wav", fs, soundsc(vq_d2))

    agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5)
    agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5)
    agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5)

    cr = _z()
    pr = _z()
    rr = _z()
    ri = _z().astype("float128")
    qr = _z()
    qi = _z().astype("float128")
    Tm = _o2()
    Hm = _o2()
    Tm_plus_Hm = _o2()
    b = _o()
    y = _z() + 0j
    z = _z() + 0j
    b_gamma = np.zeros((order + 1,), dtype="float64")
    # return pr_new due to oddness with Julia having different numbers
    # in pr at end of function vs back in this scope
    eta0, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha, -1.,
    recursion_order, iter_number, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    """
    print(eta0)
    print(sum(b_gamma))
    print(sum(periodogram))
    print(order)
    print(alpha)
    print(recursion_order)
    print(sum(y))
    print(sum(cr))
    print(sum(z))
    print(sum(pr))
    print(sum(rr))
    print(sum(qi))
    print(Tm.sum())
    print(Hm.sum())
    print(sum(b))
    raise ValueError()
    import matplotlib.pyplot as plt
    plt.specgram(agc_vq_d2, cmap="gray")
    #plt.title("Fake")
    plt.figure()
    plt.specgram(agc_d2, cmap="gray")
    #plt.title("Real")
    plt.show()
    """
    if gamma != -1.:
    d = np.zeros((order + 1,), dtype="float64")
    if alpha != 0.:
    _mgc_ignorm(b_gamma, -1.)
    _mgc_b2mc(b_gamma, alpha)
    d = copy.deepcopy(b_gamma)
    _mgc_gnorm(d, -1.)
    # numbers are slightly different here - numerical diffs?
    else:
    d = copy.deepcopy(b_gamma)
    b_gamma = _mgc_gc2gc(d, -1., order, gamma)

    if alpha != 0.:
    _mgc_ignorm(b_gamma, gamma)
    _mgc_mc2b(b_gamma, alpha)
    _mgc_gnorm(b_gamma, gamma)
    wavfile.write("phase_train_agc.wav", fs, soundsc(agc_d1))
    wavfile.write("phase_test_agc.wav", fs, soundsc(agc_d2))
    wavfile.write("phase_vq_test_agc.wav", fs, soundsc(agc_vq_d2))

    if gamma != -1.:
    eta_t = eta0
    for i in range(1, maxiter + 1):
    eta, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha,
    gamma, recursion_order, i, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    """
    print(eta0)
    print(sum(b_gamma))
    print(sum(periodogram))
    print(order)
    print(alpha)
    print(recursion_order)
    print(sum(y))
    print(sum(cr))
    print(sum(z))
    print(sum(pr))
    print(sum(rr))
    print(sum(qi))
    print(Tm.sum())
    print(Hm.sum())
    print(sum(b))
    raise ValueError()
    """
    if i >= miniter:
    err = np.abs((eta_t - eta) / eta)
    if err < criteria:
    break
    eta_t = eta
    mgc_arr = _mgc_mgcepnorm(b_gamma, alpha, gamma, otype)
    return mgc_arr

    def run_cqt_example():
        """Constant-Q transform analysis/synthesis round trip.

        Tries a local piano recording first, falls back to the bundled sample
        music, runs cqt/icqt, reports the reconstruction error, and writes the
        original and reconstruction as wav files.
        """
        try:
            fs, d = fetch_sample_file("/Users/User/cqt_resources/kempff1.wav")
        except ValueError:
            print("WARNING: Using sample music instead but kempff1.wav is the example")
            fs, d = fetch_sample_music()
        X = d[:44100]
        X_cq, c_dc, c_nyq, multiscale, shift, window_lens = cqt(X, fs)
        X_r = icqt(X_cq, c_dc, c_nyq, multiscale, shift, window_lens)
        # NOTE(review): despite the name this is the log *error* ratio (more
        # negative means a better reconstruction); name kept to match the
        # sibling run_fft_dct_example
        SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X))
        # FIX: the computed value was previously discarded; report it the same
        # way run_fft_dct_example does
        print(SNR)
        wavfile.write("cqt_original.wav", fs, soundsc(X))
        wavfile.write("cqt_reconstruction.wav", fs, soundsc(X_r))

    def _mgc_freqt(wc, c, alpha):
        """Frequency-warp cepstrum ``c`` into ``wc`` in place (SPTK ``freqt``).

        Runs the standard all-pass warping recursion with warping coefficient
        ``alpha``. ``wc`` is zeroed and overwritten with the warped
        coefficients; ``c`` is left untouched. Nothing is returned.
        """
        out_order = len(wc) - 1
        src_order = len(c) - 1
        scratch = np.zeros_like(wc)
        wc *= 0
        for step in range(-src_order, 1):
            scratch[:] = wc
            if out_order >= 0:
                wc[0] = c[-step] + alpha * scratch[0]
            if out_order >= 1:
                wc[1] = (1. - alpha * alpha) * scratch[0] + alpha * scratch[1]
            for k in range(2, out_order + 1):
                # wc[k - 1] here is this iteration's freshly written value,
                # which is what the recursion requires
                wc[k] = scratch[k - 1] + alpha * (scratch[k] - wc[k - 1])

    def run_fft_dct_example():
    random_state = np.random.RandomState(1999)

    def _mgc_mgc2mgc(src_ceps, src_alpha, src_gamma, dst_order, dst_alpha,
        dst_gamma):
        """Convert a mel-generalized cepstrum between parameter settings.

        Re-expresses ``src_ceps`` (analyzed with ``src_alpha``/``src_gamma``)
        as a length ``dst_order + 1`` cepstrum under ``dst_alpha``/
        ``dst_gamma``, via gain normalization, gamma conversion, and — when the
        warping factors differ — an in-place frequency transform. Returns the
        converted coefficients; ``src_ceps`` is not modified.
        """
        warp = (dst_alpha - src_alpha) / (1. - dst_alpha * src_alpha)
        if warp == 0.:
            # Same effective warping: only the gamma conversion is needed
            normed = copy.deepcopy(src_ceps)
            _mgc_gnorm(normed, src_gamma)
            out = _mgc_gc2gc(normed, src_gamma, dst_order, dst_gamma)
            _mgc_ignorm(out, dst_gamma)
            return out
        # Warping differs: frequency-transform first, then convert gamma
        warped = np.zeros((dst_order + 1,))
        _mgc_freqt(warped, src_ceps, warp)
        _mgc_gnorm(warped, src_gamma)
        out = _mgc_gc2gc(copy.deepcopy(warped), src_gamma, dst_order, dst_gamma)
        _mgc_ignorm(out, dst_gamma)
        return out
    fs, d = fetch_sample_speech_fruit()
    n_fft = 64
    X = d[0]
    X_stft = stft(X, n_fft)
    X_rr = complex_to_real_view(X_stft)
    X_dct = fftpack.dct(X_rr, axis=-1, norm='ortho')
    X_dct_sub = X_dct[1:] - X_dct[:-1]
    std = X_dct_sub.std(axis=0, keepdims=True)
    X_dct_sub += .01 * std * random_state.randn(
    X_dct_sub.shape[0], X_dct_sub.shape[1])
    X_dct_unsub = np.cumsum(X_dct_sub, axis=0)
    X_idct = fftpack.idct(X_dct_unsub, axis=-1, norm='ortho')
    X_irr = real_to_complex_view(X_idct)
    X_r = istft(X_irr, n_fft)[:len(X)]

    SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X))
    print(SNR)

    wavfile.write("fftdct_orig.wav", fs, soundsc(X))
    wavfile.write("fftdct_rec.wav", fs, soundsc(X_r))


    def mgc2sp(mgc_arr, alpha, gamma, fftlen):
        """Turn mel-generalized cepstral coefficients into a spectrum slice.

        Converts ``mgc_arr`` to a plain (alpha=0, gamma=0) cepstrum of order
        ``fftlen // 2``, zero-pads it to ``fftlen`` samples, and returns the
        one-sided FFT of the padded cepstrum (a complex array of length
        ``fftlen // 2 + 1``).
        """
        cep = _mgc_mgc2mgc(mgc_arr, alpha, gamma, fftlen // 2, 0., 0.)
        padded = np.zeros((fftlen,), dtype=cep.dtype)
        padded[:len(cep)] = cep
        return np.fft.rfft(padded)
    def run_world_example():
        """Analyze and resynthesize speech with the WORLD vocoder pipeline.

        Runs harvest (F0), cheaptrick (spectral envelope), and d4c
        (aperiodicity) on the tapestry sample, synthesizes from the coarse
        aperiodicity, and writes the result to out.wav.
        """
        fs, wav = fetch_sample_speech_tapestry()
        # scale int16 samples into [-1, 1)
        wav = wav.astype("float32") / 2 ** 15
        tp_h, f0_h, vuv_h, f0_cand_h = harvest(wav, fs)
        tp_ct, spec_ct, fs_ct = cheaptrick(wav, fs, tp_h, f0_h, vuv_h)
        tp_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(wav, fs,
            tp_h, f0_h, vuv_h)
        # synthesis from coarse aperiodicity; the full-band aper_d4c is the
        # alternative input
        y = world_synthesis(f0_d4c, vuv_d4c, coarse_aper_d4c, spec_ct, fs_ct)
        wavfile.write("out.wav", fs, soundsc(y))


    def run_mgc_example():
    @@ -3847,7 +4054,7 @@ def run_mgc_example():
    mgc_order = 20
    mgc_alpha = 0.41
    mgc_gamma = -0.35
    mgc_arr = mgc(xw, mgc_order, mgc_alpha, mgc_gamma)
    mgc_arr = win2mgc(xw, order=mgc_order, alpha=mgc_alpha, gamma=mgc_gamma, verbose=True)
    xwsp = 20 * np.log10(np.abs(np.fft.rfft(xw)))
    sp = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fftlen)
    plt.plot(xwsp)
    @@ -3856,11 +4063,37 @@ def run_mgc_example():
    plt.show()


    def run_world_mgc_example():
        """WORLD analysis -> MGC compression -> WORLD resynthesis demo.

        Extracts WORLD features from the tapestry sample, compresses the
        spectral envelope to mel-generalized cepstral coefficients, expands it
        back to a spectrum, resynthesizes with the coarse aperiodicity, and
        writes the result to out_mgc.wav.
        """
        fs, wav = fetch_sample_speech_tapestry()
        # scale int16 samples into [-1, 1)
        wav = wav.astype("float32") / 2 ** 15

        tp_h, f0_h, vuv_h, f0_cand_h = harvest(wav, fs)
        tp_ct, spec_ct, fs_ct = cheaptrick(wav, fs, tp_h, f0_h, vuv_h)
        tp_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(wav, fs,
            tp_h, f0_h, vuv_h)

        # harcoded for 16k from
        # https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world/extract_features_for_merlin.sh
        mgc_alpha = 0.58
        mgc_order = 59
        # gamma == 0.0 reduces MGC to plain mel-cepstrum (mcep)
        mgc_gamma = 0.0

        mgc_arr = sp2mgc(spec_ct, mgc_order, mgc_alpha, mgc_gamma,
            verbose=True)
        sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fs=fs, verbose=True)
        # coarse aperiodicity synthesis; aper_d4c is the full-band alternative
        y = world_synthesis(f0_d4c, vuv_d4c, coarse_aper_d4c, sp_r, fs)
        wavfile.write("out_mgc.wav", fs, soundsc(y))


    if __name__ == "__main__":
    run_mgc_example()
    run_world_mgc_example()
    """
    Trying to run all examples will seg fault on my laptop - probably memory!
    Comment individually
    run_mgc_example()
    run_world_example()
    run_phase_reconstruction_example()
    run_phase_vq_example()
  6. kastnerkyle revised this gist Mar 10, 2017. 1 changed file with 135 additions and 36 deletions.
    171 changes: 135 additions & 36 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -6,6 +6,7 @@
    # Pieces also adapted from SPTK
    from __future__ import division
    import numpy as np
    import scipy as sp
    from numpy.lib.stride_tricks import as_strided
    import scipy.signal as sg
    from scipy.interpolate import interp1d
    @@ -25,8 +26,6 @@
    except ImportError:
    import urllib2 as urllib

    from matplotlib import pyplot as plt


    def download(url, server_fname, local_fname=None, progress_update_percentage=5,
    bypass_certificate_check=False):
    @@ -3465,16 +3464,16 @@ def _mgc_ptrans(p, m, alpha):
    o = alpha * d
    p[0] = (1. - alpha ** 2) * p[0] + 2 * o


    def _mgc_qtrans(q, m, alpha):
    d = q[1]
    for i in range(2, 2 * m + 1):
    o = q[i] + alpha * d
    d = q[i]
    q[i] = o


    def _mgc_gain(er, c, m, g):
    t = 0
    t = 0.
    if g != 0:
    for i in range(1, m + 1):
    t += er[i] * c[i]
    @@ -3538,16 +3537,18 @@ def _mgc_gc2gc(src_ceps, src_gamma=0., dst_order=None, dst_gamma=0.):
    dst_ceps = np.zeros((dst_order + 1,), dtype=src_ceps.dtype)
    dst_order = len(dst_ceps) - 1
    m1 = len(src_ceps) - 1
    dst_ceps[0] = src_ceps[0]
    dst_ceps[0] = copy.deepcopy(src_ceps[0])

    for m in range(2, dst_order + 2):
    ss1 = 0.
    ss2 = 0.
    min_1 = m1 if (m1 < m - 1) else m - 2
    if min_1 + 1 < 2:
    itr = range(min_1 + 1, 2 + 1)[::-1]
    else:
    itr = range(2, min_1 + 2)
    itr = range(2, min_1 + 2)
    if len(itr) < 1:
    if min_1 + 1 == 2:
    itr = [2]
    else:
    itr = []
    for k in itr:
    assert k >= 1
    assert (m - k) >= 0
    @@ -3566,14 +3567,13 @@ def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    recursion_order, iter_number, y_fft, z_fft, cr, pr, rr, ri,
    qr, qi, Tm, Hm, Tm_plus_Hm, b):
    # a lot of inplace operations to match the Julia code

    cr[1:order + 1] = mgc_stored[1:order + 1]

    if alpha != 0:
    cr_res = _mgc_b2c(cr[:recursion_order + 1], cr[:order + 1], -alpha)
    cr[:recursion_order + 1] = cr_res[:]

    y = np.fft.fft(cr)
    y = sp.fftpack.fft(np.cast["float64"](cr))
    c = mgc_stored
    x = periodogram
    if gamma != 0.:
    @@ -3588,11 +3588,35 @@ def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    pr[:] = copy.deepcopy(x) / np.exp(2 * np.real(y))
    new_pr = copy.deepcopy(pr)
    else:
    print("gamma else")
    from IPython import embed; embed()
    raise ValueError()
    tr = 1. + gamma * np.real(y)
    ti = -gamma * np.imag(y)
    trr = tr * tr
    tii = ti * ti
    s = trr + tii
    t = x * np.power(s, (-gamma_inv))
    t /= s
    pr[:] = t
    rr[:] = tr * t
    ri[:] = ti * t
    t /= s
    qr[:] = (trr - tii) * t
    s = tr * ti * t
    qi[:] = (s + s)
    new_pr = copy.deepcopy(pr)

    if gamma != -1.:
    """
    print()
    print(pr.sum())
    print(rr.sum())
    print(ri.sum())
    print(qr.sum())
    print(qi.sum())
    print()
    """
    pass

    y_fft[:] = copy.copy(pr) + 0.j
    y_fft[:] = copy.deepcopy(pr) + 0.j
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    pr[:] = copy.deepcopy(np.real(z_fft))
    if alpha != 0.:
    @@ -3605,8 +3629,6 @@ def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    qr[:2 * order + 1] = pr[:2 * order + 1]
    rr[:order + 1] = copy.deepcopy(pr[:order + 1])
    else:
    print("other gamma?")
    raise ValueError()
    for i in range(len(qr)):
    y_fft[i] = qr[i] + 1j * qi[i]
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    @@ -3631,17 +3653,12 @@ def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    if gamma != -1.:
    eta = _mgc_gain(rr, c, order, gamma)
    c[0] = np.sqrt(eta)
    #print(eta)
    #from IPython import embed; embed()
    #raise ValueError()

    if gamma == -1.:
    qr[:] = 0.
    elif gamma != 0.:
    for i in range(2, 2 * order + 1):
    qr[i] *= 1. + gamma
    print(sum(qr))
    raise ValueError("gamma != 0 qr scale")

    te = pr[:order]
    _mgc_fill_toeplitz(Tm, te)
    @@ -3671,7 +3688,7 @@ def _mgc_mgcepnorm(b_gamma, alpha, gamma, otype):
    return mgc


    def mgcep(windowed_signal, order=25, alpha=0.35, gamma=0.0):
    def mgc(windowed_signal, order=25, alpha=0.35, gamma=0.0):
    # Based on r9y9 Julia code
    # https://github.com/r9y9/MelGeneralizedCepstrums.jl
    miniter = 2
    @@ -3683,20 +3700,20 @@ def mgcep(windowed_signal, order=25, alpha=0.35, gamma=0.0):
    iter_number = 1

    def _z():
    return np.zeros((len(windowed_signal),), dtype="float32")
    return np.zeros((len(windowed_signal),), dtype="float64")

    def _o():
    return np.zeros((order,), dtype="float32")
    return np.zeros((order,), dtype="float64")

    def _o2():
    return np.zeros((order, order), dtype="float32")
    return np.zeros((order, order), dtype="float64")

    cr = _z()
    pr = _z()
    rr = _z()
    ri = _z()
    ri = _z().astype("float128")
    qr = _z()
    qi = _z()
    qi = _z().astype("float128")
    Tm = _o2()
    Hm = _o2()
    Tm_plus_Hm = _o2()
    @@ -3710,6 +3727,24 @@ def _o2():
    recursion_order, iter_number, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    """
    print(eta0)
    print(sum(b_gamma))
    print(sum(periodogram))
    print(order)
    print(alpha)
    print(recursion_order)
    print(sum(y))
    print(sum(cr))
    print(sum(z))
    print(sum(pr))
    print(sum(rr))
    print(sum(qi))
    print(Tm.sum())
    print(Hm.sum())
    print(sum(b))
    raise ValueError()
    """
    if gamma != -1.:
    d = np.zeros((order + 1,), dtype="float64")
    if alpha != 0.:
    @@ -3721,6 +3756,7 @@ def _o2():
    else:
    d = copy.deepcopy(b_gamma)
    b_gamma = _mgc_gc2gc(d, -1., order, gamma)

    if alpha != 0.:
    _mgc_ignorm(b_gamma, gamma)
    _mgc_mc2b(b_gamma, alpha)
    @@ -3733,28 +3769,91 @@ def _o2():
    gamma, recursion_order, i, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    """
    print(eta0)
    print(sum(b_gamma))
    print(sum(periodogram))
    print(order)
    print(alpha)
    print(recursion_order)
    print(sum(y))
    print(sum(cr))
    print(sum(z))
    print(sum(pr))
    print(sum(rr))
    print(sum(qi))
    print(Tm.sum())
    print(Hm.sum())
    print(sum(b))
    raise ValueError()
    """
    if i >= miniter:
    err = np.abs((eta_t - eta) / eta)
    if err < criteria:
    break
    eta_t = eta
    mgc = _mgc_mgcepnorm(b_gamma, alpha, gamma, otype)
    return mgc
    mgc_arr = _mgc_mgcepnorm(b_gamma, alpha, gamma, otype)
    return mgc_arr


    def _mgc_freqt(wc, c, alpha):
    prev = np.zeros_like(wc)
    dst_order = len(wc) - 1
    wc *= 0
    m1 = len(c) - 1
    for i in range(-m1, 1, 1):
    prev[:] = wc
    if dst_order >= 0:
    wc[0] = c[-i] + alpha * prev[0]
    if dst_order >= 1:
    wc[1] = (1. - alpha * alpha) * prev[0] + alpha * prev[1]
    for m in range(2, dst_order + 1):
    wc[m] = prev[m - 1] + alpha * (prev[m] - wc[m - 1])


    def _mgc_mgc2mgc(src_ceps, src_alpha, src_gamma, dst_order, dst_alpha,
    dst_gamma):
    dst_ceps = np.zeros((dst_order + 1,))
    alpha = (dst_alpha - src_alpha) / (1. - dst_alpha * src_alpha)
    if alpha == 0.:
    new_dst_ceps = copy.deepcopy(src_ceps)
    _mgc_gnorm(new_dst_ceps, src_gamma)
    dst_ceps = _mgc_gc2gc(new_dst_ceps, src_gamma, dst_order, dst_gamma)
    _mgc_ignorm(dst_ceps, dst_gamma)
    else:
    _mgc_freqt(dst_ceps, src_ceps, alpha)
    _mgc_gnorm(dst_ceps, src_gamma)
    new_dst_ceps = copy.deepcopy(dst_ceps)
    dst_ceps = _mgc_gc2gc(new_dst_ceps, src_gamma, dst_order, dst_gamma)
    _mgc_ignorm(dst_ceps, dst_gamma)
    return dst_ceps


    def mgc2sp(mgc_arr, alpha, gamma, fftlen):
    c = _mgc_mgc2mgc(mgc_arr, alpha, gamma, fftlen // 2, 0., 0.)
    buf = np.zeros((fftlen,), dtype=c.dtype)
    buf[:len(c)] = c[:]
    return np.fft.rfft(buf)


    def run_mgc_example():
    import matplotlib.pyplot as plt
    fs, x = wavfile.read("test16k.wav")
    pos = 3000
    fftlen = 1024
    win = np.blackman(fftlen) / np.sqrt(np.sum(np.blackman(fftlen) ** 2))
    xw = x[pos:pos + fftlen] * win
    sp = 20 * np.log10(np.abs(np.fft.rfft(xw)))
    mgc_order = 25
    mgc_alpha = 0.35
    mgc_gamma = 0.0
    mgc = mgcep(xw, mgc_order, mgc_alpha, mgc_gamma)
    from IPython import embed; embed()
    raise ValueError()
    mgc_order = 20
    mgc_alpha = 0.41
    mgc_gamma = -0.35
    mgc_arr = mgc(xw, mgc_order, mgc_alpha, mgc_gamma)
    xwsp = 20 * np.log10(np.abs(np.fft.rfft(xw)))
    sp = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fftlen)
    plt.plot(xwsp)
    plt.plot(20. / np.log(10) * np.real(sp), "r")
    plt.xlim(1, len(xwsp))
    plt.show()


    if __name__ == "__main__":
  7. kastnerkyle revised this gist Mar 8, 2017. 1 changed file with 144 additions and 29 deletions.
    173 changes: 144 additions & 29 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,8 @@
    # Authors: Kyle Kastner
    # Harvest, Cheaptrick, D4C, WORLD routines based on MATLAB code from M. Morise
    # http://ml.cs.yamanashi.ac.jp/world/english/
    # MGC code based on r9y9 (Ryuichi Yamamoto) MelGeneralizedCepstrums.jl
    # Pieces also adapted from SPTK
    from __future__ import division
    import numpy as np
    from numpy.lib.stride_tricks import as_strided
    @@ -3495,33 +3497,104 @@ def _mgc_fill_hankel(A, t):
    A[i, j] = t[i + j]


    def _mgc_ignorm(c, gamma):
    if gamma == 0.:
    c[0] = np.log(c[0])
    return c
    gain = c[0] ** gamma
    c[1:] *= gain
    c[0] = (gain - 1.) / gamma


    def _mgc_gnorm(c, gamma):
    if gamma == 0.:
    c[0] = np.exp(c[0])
    return c
    gain = 1. + gamma * c[0]
    c[1:] /= gain
    c[0] = gain ** (1. / gamma)


    def _mgc_b2mc(mc, alpha):
    m = len(mc)
    o = 0.
    d = mc[m - 1]
    for i in range(m - 1)[::-1]:
    o = mc[i] + alpha * d
    d = mc[i]
    mc[i] = o


    def _mgc_mc2b(mc, alpha):
    itr = range(len(mc) - 1)[::-1]
    for i in itr:
    mc[i] = mc[i] - alpha * mc[i + 1]


    def _mgc_gc2gc(src_ceps, src_gamma=0., dst_order=None, dst_gamma=0.):
    if dst_order == None:
    dst_order = len(src_ceps) - 1

    dst_ceps = np.zeros((dst_order + 1,), dtype=src_ceps.dtype)
    dst_order = len(dst_ceps) - 1
    m1 = len(src_ceps) - 1
    dst_ceps[0] = src_ceps[0]

    for m in range(2, dst_order + 2):
    ss1 = 0.
    ss2 = 0.
    min_1 = m1 if (m1 < m - 1) else m - 2
    if min_1 + 1 < 2:
    itr = range(min_1 + 1, 2 + 1)[::-1]
    else:
    itr = range(2, min_1 + 2)
    for k in itr:
    assert k >= 1
    assert (m - k) >= 0
    cc = src_ceps[k - 1] * dst_ceps[m - k]
    ss2 += (k - 1) * cc
    ss1 += (m - k) * cc

    if m <= m1 + 1:
    dst_ceps[m - 1] = src_ceps[m - 1] + (dst_gamma * ss2 - src_gamma * ss1)/(m - 1.)
    else:
    dst_ceps[m - 1] = (dst_gamma * ss2 - src_gamma * ss1) / (m - 1.)
    return dst_ceps


    def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    recursion_order, iter_number, y_fft, z_fft, cr, pr, rr, ri,
    qr, qi, Tm, Hm, Tm_plus_Hm, b):
    # a lot of inplace operations to match the Julia code

    cr[1:order + 1] = mgc_stored[1:order + 1]

    # will need to figure out copy logic later...
    if alpha != 0:
    cr_res = _mgc_b2c(cr[:recursion_order + 1], cr[:order + 1], -alpha)
    cr[:recursion_order + 1] = cr_res[:]

    y = np.fft.fft(cr)
    c = mgc_stored
    x = periodogram
    gamma_inv = 1. / gamma
    if gamma != 0.:
    gamma_inv = 1. / gamma
    else:
    gamma_inv = np.inf

    if gamma == -1.:
    pr = copy.copy(x)
    pr[:] = copy.deepcopy(x)
    new_pr = copy.deepcopy(pr)
    elif gamma == 0.:
    print("gamma 0")
    from IPython import embed; embed()
    raise ValueError()
    pr[:] = copy.deepcopy(x) / np.exp(2 * np.real(y))
    new_pr = copy.deepcopy(pr)
    else:
    print("gamma else")
    from IPython import embed; embed()
    raise ValueError()

    y_fft = copy.copy(pr) + 0.j
    z_fft = np.fft.fft(y_fft) / len(y_fft)
    pr = np.real(z_fft)
    y_fft[:] = copy.copy(pr) + 0.j
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    pr[:] = copy.deepcopy(np.real(z_fft))
    if alpha != 0.:
    idx_1 = pr[:2 * order + 1]
    idx_2 = pr[:recursion_order + 1]
    @@ -3530,17 +3603,19 @@ def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,

    if gamma == 0. or gamma == -1.:
    qr[:2 * order + 1] = pr[:2 * order + 1]
    rr[:order + 1] = pr[:order + 1]
    rr[:order + 1] = copy.deepcopy(pr[:order + 1])
    else:
    print("other gamma?")
    raise ValueError()
    for i in range(len(qr)):
    y_fft[i] = qr[i] + 1j * qi[i]
    z_fft = np.fft.fft(y_fft) / len(y_fft)
    qr = np.real(z_fft)
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    qr[:] = np.real(z_fft)

    for i in range(len(rr)):
    y_fft[i] = rr[i] + 1j * ri[i]
    z_fft = np.fft.fft(y_fft) / len(y_fft)
    rr = np.real(z_fft)
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    rr[:] = np.real(z_fft)

    if alpha != 0.:
    qr_new = _mgc_b2c(qr[:recursion_order + 1], qr[:recursion_order + 1], alpha)
    @@ -3553,12 +3628,12 @@ def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    _mgc_qtrans(qr, order, alpha)

    eta = 0.
    if gamma != -1:
    if gamma != -1.:
    eta = _mgc_gain(rr, c, order, gamma)
    c[0] = np.sqrt(eta)
    print(eta)
    print(c[0])
    raise ValueError("eta_0 gamma == -1")
    #print(eta)
    #from IPython import embed; embed()
    #raise ValueError()

    if gamma == -1.:
    qr[:] = 0.
    @@ -3583,17 +3658,27 @@ def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
    if gamma == -1.:
    eta = _mgc_gain(rr, c, order, gamma)
    c[0] = np.sqrt(eta)
    return np.log(eta), new_pr

    return np.log(eta)

    def _mgc_mgcepnorm(b_gamma, alpha, gamma, otype):
    if otype != 0:
    raise ValueError("Not yet implemented for otype != 0")

    def _mgcep(windowed_signal, order=26, alpha=0.35, gamma=0.0):
    mgc = copy.deepcopy(b_gamma)
    _mgc_ignorm(mgc, gamma)
    _mgc_b2mc(mgc, alpha)
    return mgc


    def mgcep(windowed_signal, order=25, alpha=0.35, gamma=0.0):
    # Based on r9y9 Julia code
    # https://github.com/r9y9/MelGeneralizedCepstrums.jl
    miniter = 2
    maxiter = 30
    criteria = 0.001
    otype = 0
    periodogram = np.abs(np.fft.fft(windowed_signal)) ** 2
    e_gamma = -1.
    recursion_order = len(windowed_signal) - 1
    iter_number = 1

    @@ -3618,12 +3703,43 @@ def _o2():
    b = _o()
    y = _z() + 0j
    z = _z() + 0j
    b_gamma = np.zeros((order + 1,), dtype="float32")
    eta0 = _mgc_newton(b_gamma, periodogram, order, alpha, e_gamma,
    recursion_order, iter_number, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    from IPython import embed; embed()
    raise ValueError()
    b_gamma = np.zeros((order + 1,), dtype="float64")
    # return pr_new due to oddness with Julia having different numbers
    # in pr at end of function vs back in this scope
    eta0, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha, -1.,
    recursion_order, iter_number, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    if gamma != -1.:
    d = np.zeros((order + 1,), dtype="float64")
    if alpha != 0.:
    _mgc_ignorm(b_gamma, -1.)
    _mgc_b2mc(b_gamma, alpha)
    d = copy.deepcopy(b_gamma)
    _mgc_gnorm(d, -1.)
    # numbers are slightly different here - numerical diffs?
    else:
    d = copy.deepcopy(b_gamma)
    b_gamma = _mgc_gc2gc(d, -1., order, gamma)
    if alpha != 0.:
    _mgc_ignorm(b_gamma, gamma)
    _mgc_mc2b(b_gamma, alpha)
    _mgc_gnorm(b_gamma, gamma)

    if gamma != -1.:
    eta_t = eta0
    for i in range(1, maxiter + 1):
    eta, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha,
    gamma, recursion_order, i, y, z, cr, pr, rr,
    ri, qr, qi, Tm, Hm, Tm_plus_Hm, b)
    pr[:] = pr_new
    if i >= miniter:
    err = np.abs((eta_t - eta) / eta)
    if err < criteria:
    break
    eta_t = eta
    mgc = _mgc_mgcepnorm(b_gamma, alpha, gamma, otype)
    return mgc


    def run_mgc_example():
    @@ -3636,8 +3752,7 @@ def run_mgc_example():
    mgc_order = 25
    mgc_alpha = 0.35
    mgc_gamma = 0.0
    _mgcep(xw, mgc_order, mgc_alpha, mgc_gamma)

    mgc = mgcep(xw, mgc_order, mgc_alpha, mgc_gamma)
    from IPython import embed; embed()
    raise ValueError()

  8. kastnerkyle revised this gist Mar 7, 2017. 1 changed file with 1517 additions and 1267 deletions.
    2,784 changes: 1,517 additions & 1,267 deletions audio_tools.py
    1,517 additions, 1,267 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
  9. kastnerkyle revised this gist Feb 20, 2017. 1 changed file with 52 additions and 18 deletions.
    70 changes: 52 additions & 18 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2414,7 +2414,10 @@ def harvest_get_downsampled_signal(x, fs, target_fs):
    if fs < target_fs:
    raise ValueError("CASE NOT HANDLED IN harvest_get_downsampled_signal")
    else:
    y0 = sg.decimate(x, int(decimation_ratio), 3, zero_phase=True)
    try:
    y0 = sg.decimate(x, int(decimation_ratio), 3, zero_phase=True)
    except:
    y0 = sg.decimate(x, int(decimation_ratio), 3)
    actual_fs = fs / decimation_ratio
    y = y0[int(offset / decimation_ratio):-int(offset / decimation_ratio)]
    y = y - np.mean(y)
    @@ -2846,14 +2849,21 @@ def harvest_smooth_f0_contour(f0):
    smoothed_f0 = smoothed_f0[300:-300]
    return smoothed_f0

    def _world_get_temporal_positions(x_len, fs):
    frame_period = 5
    basic_frame_period = 1
    basic_temporal_positions = np.arange(0, x_len / float(fs), basic_frame_period / float(1000))
    temporal_positions = np.arange(0,
    x_len / float(fs),
    frame_period / float(1000))
    return basic_temporal_positions, temporal_positions

    def harvest(x, fs):
    f0_floor = 71
    f0_ceil = 800
    frame_period = 5
    basic_frame_period = 1
    target_fs = 8000
    basic_temporal_positions = np.arange(0, len(x) / float(fs), basic_frame_period / float(1000))
    channels_in_octave = 40.
    basic_temporal_positions, temporal_positions = _world_get_temporal_positions(len(x), fs)
    adjusted_f0_floor = f0_floor * 0.9
    adjusted_f0_ceil = f0_ceil * 1.1
    boundary_f0_list = np.arange(1, np.ceil(np.log2(adjusted_f0_ceil / adjusted_f0_floor) * channels_in_octave) + 1) / float(channels_in_octave)
    @@ -2875,9 +2885,6 @@ def harvest(x, fs):

    connected_f0, vuv = harvest_fix_f0_contour(f0_candidates, f0_scores)
    smoothed_f0 = harvest_smooth_f0_contour(connected_f0)
    temporal_positions = np.arange(0,
    len(x) / float(fs),
    frame_period / float(1000))
    idx = np.minimum(len(smoothed_f0) - 1, np.round(temporal_positions * 1000)).astype("int32")
    f0 = smoothed_f0[idx]
    vuv = vuv[idx]
    @@ -3140,6 +3147,7 @@ def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
    window_length = np.floor(frequency_interval / (fs / float(fft_size))) * 2 + 1
    window1 = harvest_nuttall(window_length)
    aperiodicity = np.zeros((int(fft_size_for_spectrum / 2) + 1, len(f0)))
    coarse_ap = np.zeros((1, len(f0)))

    frequency_axis = np.arange(int(fft_size_for_spectrum / 2) + 1) * float(fs) / fft_size_for_spectrum
    coarse_axis = np.arange(number_of_aperiodicities + 2) * frequency_interval
    @@ -3154,11 +3162,12 @@ def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
    coarse_aperiodicity = d4c_estimate_one_slice(x, fs, current_f0,
    frequency_interval, temporal_positions[i], fft_size,
    number_of_aperiodicities, window1)
    coarse_ap[0, i] = coarse_aperiodicity.ravel()[0]
    coarse_aperiodicity = np.maximum(0, coarse_aperiodicity - (current_f0 - 100) * 2. / 100.)
    piece = np.concatenate([[-60], -coarse_aperiodicity.ravel(), [-0.000000000001]])
    part = interp1d(coarse_axis, piece, kind="linear")(frequency_axis) / 20.
    aperiodicity[:, i] = 10 ** part
    return temporal_positions_h, f0_h, vuv_h, aperiodicity
    return temporal_positions_h, f0_h, vuv_h, aperiodicity, coarse_ap

    def world_synthesis_time_base_generation(temporal_positions, f0, fs, vuv,
    time_axis, default_f0):
    @@ -3273,16 +3282,46 @@ def fftfilt(b, x, *n):
    i += L
    return y

    def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    def world_synthesis(f0_d4c, vuv_d4c, aperiodicity_d4c,
    spectrogram_ct, fs_ct, random_seed=1999):

    fs = fs_ct
    # coarse -> fine aper
    if aperiodicity_d4c.shape[0] == 1:
    print("Coarse aperiodicity detected - interpolating to full size")
    aper = np.zeros_like(spectrogram_ct)
    coarse_aper_d4c = aperiodicity_d4c
    frequency_interval = 3000
    upper_limit = 15000
    number_of_aperiodicities = int(np.floor(np.min([upper_limit, fs / 2. - frequency_interval]) / float(frequency_interval)))
    coarse_axis = np.arange(number_of_aperiodicities + 2) * frequency_interval
    coarse_axis[-1] = fs / 2.
    f0_low_limit_for_spectrum = 71
    fft_size_for_spectrum = 2 ** np.ceil(np.log2(3 * fs / f0_low_limit_for_spectrum + 1.))
    frequency_axis = np.arange(int(fft_size_for_spectrum / 2) + 1) * float(fs) / fft_size_for_spectrum

    for i in range(len(f0_d4c)):
    ca = coarse_aper_d4c[0, i]
    cf = f0_d4c[i]
    coarse_aperiodicity = np.maximum(0, ca - (cf - 100) * 2. / 100.)
    piece = np.concatenate([[-60], -ca.ravel(), [-0.000000000001]])
    part = interp1d(coarse_axis, piece, kind="linear")(frequency_axis) / 20.
    aper[:, i] = 10 ** part
    aperiodicity_d4c = aper

    default_f0 = 500.
    random_state = np.random.RandomState(1999)
    fs = fs_ct
    spectrogram = spectrogram_ct
    aperiodicity = aperiodicity_d4c
    # max 30s, if greater than thrown an error
    max_len = 5000000
    _, temporal_positions = _world_get_temporal_positions(max_len, fs)
    temporal_positions = temporal_positions[:spectrogram.shape[1]]
    #temporal_positions = temporal_positions_d4c
    #from IPython import embed; embed()
    #raise ValueError()
    vuv = vuv_d4c
    f0 = f0_d4c
    temporal_positions = temporal_positions_d4c

    time_axis = np.arange(temporal_positions[0], temporal_positions[-1],
    1. / fs)
    @@ -3344,21 +3383,16 @@ def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    return y

    def run_world_example():
    #fs, d = wavfile.read("vaiueo2d_16k.wav")
    fs, d = fetch_sample_speech_tapestry()
    d = d.astype("float32") / 2 ** 15
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c = d4c(d, fs,
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    y = world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c,
    aper_d4c, spectrogram_ct, fs_ct)
    y = world_synthesis(f0_d4c, vuv_d4c, aper_d4c, spectrogram_ct, fs_ct)
    wavfile.write("out.wav", fs, soundsc(y))

    #from IPython import embed; embed()
    #raise ValueError()

    if __name__ == "__main__":
    run_world_example()
    """
  10. kastnerkyle revised this gist Feb 19, 2017. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2860,7 +2860,7 @@ def harvest(x, fs):
    boundary_f0_list = adjusted_f0_floor * 2.0 ** boundary_f0_list
    y, actual_fs = harvest_get_downsampled_signal(x, fs, target_fs)
    fft_size = 2. ** np.ceil(np.log2(len(y) + np.round(fs / f0_floor * 4) + 1))
    y_spectrum = np.fft.fft(y, 8192)
    y_spectrum = np.fft.fft(y, int(fft_size))
    raw_f0_candidates = harvest_get_raw_f0_candidates(
    len(basic_temporal_positions),
    boundary_f0_list, len(y), basic_temporal_positions, actual_fs,
    @@ -3344,8 +3344,8 @@ def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    return y

    def run_world_example():
    fs, d = wavfile.read("vaiueo2d_16k.wav")
    #fs, d = fetch_sample_speech_tapestry()[:20000]
    #fs, d = wavfile.read("vaiueo2d_16k.wav")
    fs, d = fetch_sample_speech_tapestry()
    d = d.astype("float32") / 2 ** 15
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
  11. kastnerkyle revised this gist Feb 18, 2017. 1 changed file with 71 additions and 4 deletions.
    75 changes: 71 additions & 4 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,7 @@
    # Authors: Kyle Kastner
    from __future__ import division
    import numpy as np

    """
    # TODO: Is this fixed or hosed
    # Do numpy version check for <= 1.9 ! Something crazy going on in 1.10
    @@ -3206,6 +3207,72 @@ def world_synthesis_get_spectral_parameters(temporal_positions,
    kind="linear", axis=0)(mmm.copy())
    return spectrum_slice, periodic_slice, aperiodic_slice

    """
    Filter data with an FIR filter using the overlap-add method.
    from http://projects.scipy.org/scipy/attachment/ticket/837/fftfilt.py
    """
    def nextpow2(x):
    """Return the first integer N such that 2**N >= abs(x)"""
    return np.ceil(np.log2(np.abs(x)))

    def fftfilt(b, x, *n):
    """Filter the signal x with the FIR filter described by the
    coefficients in b using the overlap-add method. If the FFT
    length n is not specified, it and the overlap-add block length
    are selected so as to minimize the computational cost of
    the filtering operation."""

    N_x = len(x)
    N_b = len(b)

    # Determine the FFT length to use:
    if len(n):
    # Use the specified FFT length (rounded up to the nearest
    # power of 2), provided that it is no less than the filter
    # length:
    n = n[0]
    if n != int(n) or n <= 0:
    raise ValueError('n must be a nonnegative integer')
    if n < N_b:
    n = N_b
    N_fft = 2**nextpow2(n)
    else:
    if N_x > N_b:
    # When the filter length is smaller than the signal,
    # choose the FFT length and block size that minimize the
    # FLOPS cost. Since the cost for a length-N FFT is
    # (N/2)*log2(N) and the filtering operation of each block
    # involves 2 FFT operations and N multiplications, the
    # cost of the overlap-add method for 1 length-N block is
    # N*(1+log2(N)). For the sake of efficiency, only FFT
    # lengths that are powers of 2 are considered:
    N = 2**np.arange(np.ceil(np.log2(N_b)),
    np.floor(np.log2(N_x)))
    cost = np.ceil(N_x/(N-N_b+1))*N*(np.log2(N)+1)
    N_fft = N[np.argmin(cost)]
    else:
    # When the filter length is at least as long as the signal,
    # filter the signal using a single block:
    N_fft = 2**nextpow2(N_b+N_x-1)

    N_fft = int(N_fft)

    # Compute the block length:
    L = int(N_fft - N_b + 1)

    # Compute the transform of the filter:
    H = np.fft.fft(b, N_fft)

    y = np.zeros(N_x, dtype=np.float32)
    i = 0
    while i <= N_x:
    il = min([i+L,N_x])
    k = min([i+N_fft,N_x])
    yt = np.fft.ifft(np.fft.fft(x[i:il],N_fft)*H,N_fft) # Overlap..
    y[i:k] = y[i:k] + yt[:k-i] # and add
    i += L
    return y

    def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    spectrogram_ct, fs_ct, random_seed=1999):
    default_f0 = 500.
    @@ -3273,12 +3340,12 @@ def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    response = np.fft.fftshift(np.real(np.fft.ifft(erc)))
    noise_input = random_state.randn(max([3, noise_size]),)

    a = 0 * noise_input + 1
    y[output_buffer_index] = y[output_buffer_index] + sg.lfilter(noise_input - np.mean(noise_input), a, response)
    y[output_buffer_index] = y[output_buffer_index] + fftfilt(noise_input - np.mean(noise_input), response)
    return y

    def run_world_example():
    fs, d = wavfile.read("vaiueo2d.wav")
    fs, d = wavfile.read("vaiueo2d_16k.wav")
    #fs, d = fetch_sample_speech_tapestry()[:20000]
    d = d.astype("float32") / 2 ** 15
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    @@ -3287,7 +3354,7 @@ def run_world_example():
    temporal_positions_h, f0_h, vuv_h)
    y = world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c,
    aper_d4c, spectrogram_ct, fs_ct)
    wavfile.write("out.wav", 22050, soundsc(y))
    wavfile.write("out.wav", fs, soundsc(y))

    #from IPython import embed; embed()
    #raise ValueError()
  12. kastnerkyle revised this gist Feb 17, 2017. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -3277,7 +3277,6 @@ def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    y[output_buffer_index] = y[output_buffer_index] + sg.lfilter(noise_input - np.mean(noise_input), a, response)
    return y

    import matplotlib.pyplot as plt
    def run_world_example():
    fs, d = wavfile.read("vaiueo2d.wav")
    d = d.astype("float32") / 2 ** 15
  13. kastnerkyle revised this gist Feb 17, 2017. 1 changed file with 55 additions and 28 deletions.
    83 changes: 55 additions & 28 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2845,12 +2845,6 @@ def harvest_smooth_f0_contour(f0):
    smoothed_f0 = smoothed_f0[300:-300]
    return smoothed_f0

    from tempfile import mkdtemp
    cachedir = "joblib_cache"
    from sklearn.externals.joblib import Memory
    memory = Memory(cachedir=cachedir, verbose=0)

    @memory.cache
    def harvest(x, fs):
    f0_floor = 71
    f0_ceil = 800
    @@ -3186,12 +3180,11 @@ def world_synthesis_time_base_generation(temporal_positions, f0, fs, vuv,
    def world_synthesis_get_spectral_parameters(temporal_positions,
    temporal_position_index, spectrogram, amplitude_periodic,
    amplitude_random, pulse_locations):
    floor_index = int(np.floor(temporal_position_index))
    ceil_index = int(np.ceil(temporal_position_index))
    floor_index = int(np.floor(temporal_position_index) - 1)
    assert floor_index >= 0
    ceil_index = int(np.ceil(temporal_position_index) - 1)
    t1 = temporal_positions[floor_index]
    t2 = temporal_positions[ceil_index]
    from IPython import embed; embed()
    raise ValueError()

    if t1 == t2:
    spectrum_slice = spectrogram[:, floor_index]
    @@ -3200,24 +3193,23 @@ def world_synthesis_get_spectral_parameters(temporal_positions,
    else:
    cs = np.concatenate([spectrogram[:, floor_index][None],
    spectrogram[:, ceil_index][None]], axis=0)
    mms = np.maximum(t1, np.minimum(t2, pulse_locations))
    mmm = max([t1, min([t2, pulse_locations])])
    spectrum_slice = interp1d(np.array([t1, t2]), cs,
    kind="linear", axis=0)(mms)
    mmp = np.maximum(t1, np.minimum(t2, pulse_locations))
    kind="linear", axis=0)(mmm.copy())
    cp = np.concatenate([amplitude_periodic[:, floor_index][None],
    amplitude_periodic[:, ceil_index][None]], axis=0)
    periodic_slice = interp1d(np.array([t1, t2]), cs,
    kind="linear", axis=0)(mmp)
    mma = np.maximum(t1, np.minimum(t2, pulse_locations))
    periodic_slice = interp1d(np.array([t1, t2]), cp,
    kind="linear", axis=0)(mmm.copy())
    ca = np.concatenate([amplitude_random[:, floor_index][None],
    amplitude_random[:, ceil_index][None]], axis=0)
    aperiodic_slice = interp1d(np.array([t1, t2]), cs,
    kind="linear", axis=0)(mma)
    aperiodic_slice = interp1d(np.array([t1, t2]), ca,
    kind="linear", axis=0)(mmm.copy())
    return spectrum_slice, periodic_slice, aperiodic_slice

    def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    spectrogram_ct, fs_ct):
    spectrogram_ct, fs_ct, random_seed=1999):
    default_f0 = 500.
    random_state = np.random.RandomState(1999)
    fs = fs_ct
    spectrogram = spectrogram_ct
    aperiodicity = aperiodicity_d4c
    @@ -3232,10 +3224,10 @@ def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    time_axis, default_f0)
    pulse_locations, pulse_locations_index, interpolated_vuv = r
    fft_size = int((len(spectrogram) - 1) * 2)
    base_index = np.arange(-fft_size / 2 + 1, fft_size / 2 + 1)
    base_index = np.arange(-fft_size / 2, fft_size / 2) + 1
    y_length = len(y)
    tmp_complex_cepstrum = np.zeros((fft_size, 1), dtype=np.complex64)
    latter_index = np.arange(int(fft_size / 2) + 1, fft_size + 1)
    tmp_complex_cepstrum = np.zeros((fft_size,), dtype=np.complex128)
    latter_index = np.arange(int(fft_size / 2) + 1, fft_size + 1) - 1

    temporal_position_index = interp1d(temporal_positions, np.arange(1, len(temporal_positions) + 1), kind="linear", fill_value="extrapolate")(pulse_locations)
    temporal_postion_index = np.maximum(1, np.minimum(len(temporal_positions),
    @@ -3247,10 +3239,43 @@ def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    for i in range(len(pulse_locations_index)):
    spectrum_slice, periodic_slice, aperiodic_slice = world_synthesis_get_spectral_parameters(
    temporal_positions, temporal_position_index[i], spectrogram,
    amplitude_periodic, amplitude_aperiodic, pulse_locations)

    from IPython import embed; embed()
    raise ValueError()
    amplitude_periodic, amplitude_aperiodic, pulse_locations[i])
    idx = min(len(pulse_locations_index), i + 2) - 1
    noise_size = pulse_locations_index[idx] - pulse_locations_index[i]
    output_buffer_index = np.maximum(1, np.minimum(y_length, pulse_locations_index[i] + 1 + base_index)).astype("int32") - 1

    if interpolated_vuv[pulse_locations_index[i]] >= 0.5:
    tmp_periodic_spectrum = spectrum_slice * periodic_slice
    # eps in matlab/octave
    tmp_periodic_spectrum[tmp_periodic_spectrum == 0] = 2.2204E-16
    periodic_spectrum = np.concatenate([tmp_periodic_spectrum,
    tmp_periodic_spectrum[1:-1][::-1]])
    tmp_cepstrum = np.real(np.fft.fft(np.log(np.abs(periodic_spectrum)) / 2.))
    tmp_complex_cepstrum[latter_index] = tmp_cepstrum[latter_index] * 2
    tmp_complex_cepstrum[0] = tmp_cepstrum[0]

    response = np.fft.fftshift(np.real(np.fft.ifft(np.exp(np.fft.ifft(
    tmp_complex_cepstrum)))))
    y[output_buffer_index] += response * np.sqrt(
    max([1, noise_size]))
    tmp_aperiodic_spectrum = spectrum_slice * aperiodic_slice
    else:
    tmp_aperiodic_spectrum = spectrum_slice

    tmp_aperiodic_spectrum[tmp_aperiodic_spectrum == 0] = 2.2204E-16
    aperiodic_spectrum = np.concatenate([tmp_aperiodic_spectrum,
    tmp_aperiodic_spectrum[1:-1][::-1]])
    tmp_cepstrum = np.real(np.fft.fft(np.log(np.abs(aperiodic_spectrum)) / 2.))
    tmp_complex_cepstrum[latter_index] = tmp_cepstrum[latter_index] * 2
    tmp_complex_cepstrum[0] = tmp_cepstrum[0]
    rc = np.fft.ifft(tmp_complex_cepstrum)
    erc = np.exp(rc)
    response = np.fft.fftshift(np.real(np.fft.ifft(erc)))
    noise_input = random_state.randn(max([3, noise_size]),)

    a = 0 * noise_input + 1
    y[output_buffer_index] = y[output_buffer_index] + sg.lfilter(noise_input - np.mean(noise_input), a, response)
    return y

    import matplotlib.pyplot as plt
    def run_world_example():
    @@ -3263,8 +3288,10 @@ def run_world_example():
    temporal_positions_h, f0_h, vuv_h)
    y = world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c,
    aper_d4c, spectrogram_ct, fs_ct)
    from IPython import embed; embed()
    raise ValueError()
    wavfile.write("out.wav", 22050, soundsc(y))

    #from IPython import embed; embed()
    #raise ValueError()

    if __name__ == "__main__":
    run_world_example()
  14. kastnerkyle revised this gist Feb 14, 2017. 1 changed file with 90 additions and 2 deletions.
    92 changes: 90 additions & 2 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -3165,16 +3165,104 @@ def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
    aperiodicity[:, i] = 10 ** part
    return temporal_positions_h, f0_h, vuv_h, aperiodicity

    import matplotlib.pyplot as plt
    def world_synthesis_time_base_generation(temporal_positions, f0, fs, vuv,
    time_axis, default_f0):
    f0_interpolated_raw = interp1d(temporal_positions, f0, kind="linear",
    fill_value="extrapolate")(time_axis)
    vuv_interpolated = interp1d(temporal_positions, vuv, kind="linear",
    fill_value="extrapolate")(time_axis)
    vuv_interpolated = vuv_interpolated > 0.5
    f0_interpolated = f0_interpolated_raw * vuv_interpolated.astype("float32")
    f0_interpolated[f0_interpolated == 0] = f0_interpolated[f0_interpolated == 0] + default_f0
    total_phase = np.cumsum(2 * np.pi * f0_interpolated / float(fs))

    core = np.mod(total_phase, 2 * np.pi)
    core = np.abs(core[1:] - core[:-1])
    # account for diff, avoid deprecation warning with [:-1]
    pulse_locations = time_axis[:-1][core > (np.pi / 2.)]
    pulse_locations_index = np.round(pulse_locations * fs).astype("int32")
    return pulse_locations, pulse_locations_index, vuv_interpolated

    def world_synthesis_get_spectral_parameters(temporal_positions,
    temporal_position_index, spectrogram, amplitude_periodic,
    amplitude_random, pulse_locations):
    floor_index = int(np.floor(temporal_position_index))
    ceil_index = int(np.ceil(temporal_position_index))
    t1 = temporal_positions[floor_index]
    t2 = temporal_positions[ceil_index]
    from IPython import embed; embed()
    raise ValueError()

    if t1 == t2:
    spectrum_slice = spectrogram[:, floor_index]
    periodic_slice = amplitude_periodic[:, floor_index]
    aperiodic_slice = amplitude_random[:, floor_index]
    else:
    cs = np.concatenate([spectrogram[:, floor_index][None],
    spectrogram[:, ceil_index][None]], axis=0)
    mms = np.maximum(t1, np.minimum(t2, pulse_locations))
    spectrum_slice = interp1d(np.array([t1, t2]), cs,
    kind="linear", axis=0)(mms)
    mmp = np.maximum(t1, np.minimum(t2, pulse_locations))
    cp = np.concatenate([amplitude_periodic[:, floor_index][None],
    amplitude_periodic[:, ceil_index][None]], axis=0)
    periodic_slice = interp1d(np.array([t1, t2]), cs,
    kind="linear", axis=0)(mmp)
    mma = np.maximum(t1, np.minimum(t2, pulse_locations))
    ca = np.concatenate([amplitude_random[:, floor_index][None],
    amplitude_random[:, ceil_index][None]], axis=0)
    aperiodic_slice = interp1d(np.array([t1, t2]), cs,
    kind="linear", axis=0)(mma)
    return spectrum_slice, periodic_slice, aperiodic_slice

    def world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c, aperiodicity_d4c,
    spectrogram_ct, fs_ct):
    default_f0 = 500.
    fs = fs_ct
    spectrogram = spectrogram_ct
    aperiodicity = aperiodicity_d4c
    vuv = vuv_d4c
    f0 = f0_d4c
    temporal_positions = temporal_positions_d4c

    time_axis = np.arange(temporal_positions[0], temporal_positions[-1],
    1. / fs)
    y = 0. * time_axis
    r = world_synthesis_time_base_generation(temporal_positions, f0, fs, vuv,
    time_axis, default_f0)
    pulse_locations, pulse_locations_index, interpolated_vuv = r
    fft_size = int((len(spectrogram) - 1) * 2)
    base_index = np.arange(-fft_size / 2 + 1, fft_size / 2 + 1)
    y_length = len(y)
    tmp_complex_cepstrum = np.zeros((fft_size, 1), dtype=np.complex64)
    latter_index = np.arange(int(fft_size / 2) + 1, fft_size + 1)

    temporal_position_index = interp1d(temporal_positions, np.arange(1, len(temporal_positions) + 1), kind="linear", fill_value="extrapolate")(pulse_locations)
    temporal_postion_index = np.maximum(1, np.minimum(len(temporal_positions),
    temporal_position_index)) - 1

    amplitude_aperiodic = aperiodicity ** 2
    amplitude_periodic = np.maximum(0.001, (1. - amplitude_aperiodic))

    for i in range(len(pulse_locations_index)):
    spectrum_slice, periodic_slice, aperiodic_slice = world_synthesis_get_spectral_parameters(
    temporal_positions, temporal_position_index[i], spectrogram,
    amplitude_periodic, amplitude_aperiodic, pulse_locations)

    from IPython import embed; embed()
    raise ValueError()

    import matplotlib.pyplot as plt
    def run_world_example():
    fs, d = wavfile.read("vaiueo2d.wav")
    d = d.astype("float32") / 2 ** 15
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs = cheaptrick(d, fs,
    temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c = d4c(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    y = world_synthesis(temporal_positions_d4c, f0_d4c, vuv_d4c,
    aper_d4c, spectrogram_ct, fs_ct)
    from IPython import embed; embed()
    raise ValueError()

  15. kastnerkyle revised this gist Feb 14, 2017. 1 changed file with 110 additions and 8 deletions.
    118 changes: 110 additions & 8 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2846,7 +2846,7 @@ def harvest_smooth_f0_contour(f0):
    return smoothed_f0

    from tempfile import mkdtemp
    cachedir = "~/joblib_cache"
    cachedir = "joblib_cache"
    from sklearn.externals.joblib import Memory
    memory = Memory(cachedir=cachedir, verbose=0)

    @@ -3028,6 +3028,103 @@ def d4c_get_windowed_waveform(x, fs, current_f0, current_position, half_length,
    waveform = segment * window1 - window1 * np.mean(segment * window1) / np.mean(window1)
    return waveform

    def d4c_get_static_centroid(x, fs, current_f0, current_position, fft_size):
    waveform1 = d4c_get_windowed_waveform(x, fs, current_f0,
    current_position + 1. / current_f0 / 4., 2, 2)
    waveform2 = d4c_get_windowed_waveform(x, fs, current_f0,
    current_position - 1. / current_f0 / 4., 2, 2)
    centroid1 = d4c_get_centroid(waveform1, fft_size)
    centroid2 = d4c_get_centroid(waveform2, fft_size)
    centroid = d4c_dc_correction(centroid1 + centroid2, fs, fft_size,
    current_f0)
    return centroid

    def d4c_get_centroid(x, fft_size):
    fft_size = int(fft_size)
    time_axis = np.arange(1, len(x) + 1)
    x = x.copy()
    x = x / np.sqrt(np.sum(x ** 2))

    spectrum = np.fft.fft(x, fft_size)
    weighted_spectrum = np.fft.fft(-x * 1j * time_axis, fft_size)
    centroid = -(weighted_spectrum.imag) * spectrum.real + spectrum.imag * weighted_spectrum.real
    return centroid

    def d4c_dc_correction(signal, fs, fft_size, f0):
    fft_size = int(fft_size)
    frequency_axis = np.arange(fft_size) / fft_size * fs
    low_frequency_axis = frequency_axis[frequency_axis < f0 + fs / fft_size]
    low_frequency_replica = interp1d(f0 - low_frequency_axis,
    signal[frequency_axis < f0 + fs / fft_size],
    kind="linear",
    fill_value="extrapolate")(low_frequency_axis)
    idx = frequency_axis < f0
    signal[idx] = low_frequency_replica[idx[:len(low_frequency_replica)]] + signal[idx]
    signal[int(fft_size / 2.) + 1:] = signal[1 : int(fft_size / 2.)][::-1]
    return signal

    def d4c_linear_smoothing(group_delay, fs, fft_size, width):
    double_frequency_axis = np.arange(2 * fft_size) / float(fft_size ) * fs - fs
    double_spectrum = np.concatenate([group_delay, group_delay])

    double_segment = np.cumsum(double_spectrum * (fs / float(fft_size)))
    center_frequency = np.arange(int(fft_size / 2) + 1) / float(fft_size ) * fs
    low_levels = cheaptrick_interp1h(double_frequency_axis + fs / float(fft_size) / 2.,
    double_segment, center_frequency - width / 2.)
    high_levels = cheaptrick_interp1h(double_frequency_axis + fs / float(fft_size) / 2.,
    double_segment, center_frequency + width / 2.)
    smoothed_spectrum = (high_levels - low_levels) / width
    return smoothed_spectrum

    def d4c_get_smoothed_power_spectrum(waveform, fs, f0, fft_size):
    power_spectrum = np.abs(np.fft.fft(waveform, int(fft_size))) ** 2
    spectral_envelope = d4c_dc_correction(power_spectrum, fs, fft_size, f0)
    spectral_envelope = d4c_linear_smoothing(spectral_envelope, fs, fft_size, f0)
    spectral_envelope = np.concatenate([spectral_envelope,
    spectral_envelope[1:-1][::-1]])
    return spectral_envelope

    def d4c_get_static_group_delay(static_centroid, smoothed_power_spectrum, fs, f0,
    fft_size):
    group_delay = static_centroid / smoothed_power_spectrum
    group_delay = d4c_linear_smoothing(group_delay, fs, fft_size, f0 / 2.)
    group_delay = np.concatenate([group_delay, group_delay[1:-1][::-1]])
    smoothed_group_delay = d4c_linear_smoothing(group_delay, fs, fft_size, f0)
    group_delay = group_delay[:int(fft_size / 2) + 1] - smoothed_group_delay
    group_delay = np.concatenate([group_delay, group_delay[1:-1][::-1]])
    return group_delay

    def d4c_get_coarse_aperiodicity(group_delay, fs, fft_size,
    frequency_interval, number_of_aperiodicities, window1):
    boundary = np.round(fft_size / len(window1) * 8)
    half_window_length = np.floor(len(window1) / 2)
    coarse_aperiodicity = np.zeros((number_of_aperiodicities, 1))
    for i in range(1, number_of_aperiodicities + 1):
    center = np.floor(frequency_interval * i / (fs / float(fft_size)))
    segment = group_delay[int(center - half_window_length):int(center + half_window_length + 1)] * window1
    power_spectrum = np.abs(np.fft.fft(segment, int(fft_size))) ** 2
    cumulative_power_spectrum = np.cumsum(np.sort(power_spectrum[:int(fft_size / 2) + 1]))
    coarse_aperiodicity[i - 1] = -10 * np.log10(cumulative_power_spectrum[int(fft_size / 2 - boundary) - 1] / cumulative_power_spectrum[-1])
    return coarse_aperiodicity

    def d4c_estimate_one_slice(x, fs, current_f0, frequency_interval,
    current_position, fft_size, number_of_aperiodicities, window1):
    if current_f0 == 0:
    coarse_aperiodicity = np.zeros((number_of_aperiodicities, 1))
    return coarse_aperiodicity

    static_centroid = d4c_get_static_centroid(x, fs, current_f0,
    current_position, fft_size)
    waveform = d4c_get_windowed_waveform(x, fs, current_f0, current_position,
    2, 1)
    smoothed_power_spectrum = d4c_get_smoothed_power_spectrum(waveform, fs,
    current_f0, fft_size)
    static_group_delay = d4c_get_static_group_delay(static_centroid,
    smoothed_power_spectrum, fs, current_f0, fft_size)
    coarse_aperiodicity = d4c_get_coarse_aperiodicity(static_group_delay,
    fs, fft_size, frequency_interval, number_of_aperiodicities, window1)
    return coarse_aperiodicity

    def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
    fft_size="auto"):
    f0_low_limit = 47
    @@ -3044,14 +3141,13 @@ def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
    temporal_positions = temporal_positions_h.copy()
    f0[vuv_h == 0] = 0.

    number_of_aperiodicities = int(np.floor(np.min([upper_limit, fs / 2.]) / float(frequency_interval)))
    number_of_aperiodicities = int(np.floor(np.min([upper_limit, fs / 2. - frequency_interval]) / float(frequency_interval)))
    window_length = np.floor(frequency_interval / (fs / float(fft_size))) * 2 + 1
    window1 = harvest_nuttall(window_length)
    aperiodicity = np.zeros((int(fft_size_for_spectrum / 2) + 1, len(f0)))
    ap_debug = np.zeros((number_of_aperiodicities, len(f0)))

    frequency_axis = np.arange(int(fft_size_for_spectrum / 2) + 1) * float(fs) / fft_size_for_spectrum
    coarse_axis = np.arange(number_of_aperiodicities + 1) * frequency_interval
    coarse_axis = np.arange(number_of_aperiodicities + 2) * frequency_interval
    coarse_axis[-1] = fs / 2.

    for i in range(len(f0)):
    @@ -3060,9 +3156,14 @@ def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
    aperiodicity[:, i] = 1 - 0.000000000001
    continue
    current_f0 = max([f0_low_limit, f0[i]])
    from IPython import embed; embed()
    raise ValueError()

    coarse_aperiodicity = d4c_estimate_one_slice(x, fs, current_f0,
    frequency_interval, temporal_positions[i], fft_size,
    number_of_aperiodicities, window1)
    coarse_aperiodicity = np.maximum(0, coarse_aperiodicity - (current_f0 - 100) * 2. / 100.)
    piece = np.concatenate([[-60], -coarse_aperiodicity.ravel(), [-0.000000000001]])
    part = interp1d(coarse_axis, piece, kind="linear")(frequency_axis) / 20.
    aperiodicity[:, i] = 10 ** part
    return temporal_positions_h, f0_h, vuv_h, aperiodicity

    import matplotlib.pyplot as plt

    @@ -3072,7 +3173,8 @@ def run_world_example():
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    s = d4c(d, fs, temporal_positions_h, f0_h, vuv_h)
    temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c = d4c(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    from IPython import embed; embed()
    raise ValueError()

  16. kastnerkyle revised this gist Feb 13, 2017. 1 changed file with 78 additions and 0 deletions.
    78 changes: 78 additions & 0 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2977,6 +2977,8 @@ def cheaptrick(x, fs, temporal_positions, f0_sequence,
    default_f0 = 500
    if fft_size == "auto":
    fft_size = int(2 ** np.ceil(np.log2(3. * float(fs) / f0_low_limit + 1)))
    else:
    raise ValueError("Only fft_size auto currently supported")
    f0_low_limit = fs * 3.0 / (fft_size - 3.0)
    f0_sequence[vuv == 0] = default_f0
    spectrogram = np.zeros((int(fft_size / 2.) + 1, len(f0_sequence)))
    @@ -2987,6 +2989,81 @@ def cheaptrick(x, fs, temporal_positions, f0_sequence,
    temporal_positions[i], fft_size, q1)
    return temporal_positions, spectrogram, fs

    def d4c_love_train(x, fs, current_f0, current_position, threshold):
    vuv = 0
    if current_f0 == 0:
    return vuv
    lowest_f0 = 40
    current_f0 = max([current_f0, lowest_f0])
    fft_size = int(2 ** np.ceil(np.log2(3. * fs / lowest_f0 + 1)))
    boundary0 = int(np.ceil(100 / (float(fs) / fft_size)))
    boundary1 = int(np.ceil(4000 / (float(fs) / fft_size)))
    boundary2 = int(np.ceil(7900 / (float(fs) / fft_size)))

    waveform = d4c_get_windowed_waveform(x, fs, current_f0, current_position,
    1.5, 2)
    power_spectrum = np.abs(np.fft.fft(waveform, int(fft_size)) ** 2)
    power_spectrum[0:boundary0 + 1] = 0.
    cumulative_spectrum = np.cumsum(power_spectrum)
    if (cumulative_spectrum[boundary1] / cumulative_spectrum[boundary2]) > threshold:
    vuv = 1
    return vuv

def d4c_get_windowed_waveform(x, fs, current_f0, current_position, half_length,
                              window_type):
    # Extract a window of `half_length` pitch periods on each side of
    # `current_position` (seconds), apply a Hanning (window_type=1) or
    # Blackman-style (window_type=2) window, and remove the window-weighted
    # DC component.
    half_window_length = int(np.round(half_length * fs / current_f0))
    base_index = np.arange(-half_window_length, half_window_length + 1)
    # +0.001 / +1 / -1 dance mirrors the 1-indexed MATLAB rounding exactly.
    index = np.round(current_position * fs + 0.001) + base_index + 1
    # Clamp to valid sample range (edge samples repeat near boundaries).
    safe_index = np.minimum(len(x), np.maximum(1, np.round(index))).astype("int32") - 1

    segment = x[safe_index]
    time_axis = base_index / float(fs) / float(half_length)
    if window_type == 1:
        # Hanning window.
        window1 = 0.5 * np.cos(np.pi * time_axis * current_f0) + 0.5
    elif window_type == 2:
        # Blackman window (0.42 + 0.5 cos + 0.08 cos2 form).
        window1 = 0.08 * np.cos(np.pi * time_axis * current_f0 * 2)
        window1 += 0.5 * np.cos(np.pi * time_axis * current_f0) + 0.42
    else:
        raise ValueError("Unknown window type")
    # Subtract the window-weighted mean so the windowed segment is zero-mean.
    waveform = segment * window1 - window1 * np.mean(segment * window1) / np.mean(window1)
    return waveform

def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
        fft_size="auto"):
    # D4C band-aperiodicity estimator (WORLD). UNFINISHED in this revision:
    # the per-frame loop ends in debug scaffolding (`embed()` /
    # `raise ValueError()`), so calling this will always stop there.
    f0_low_limit = 47
    if fft_size == "auto":
        fft_size = 2 ** np.ceil(np.log2(4. * fs / f0_low_limit + 1.))
    else:
        raise ValueError("Only fft_size auto currently supported")
    f0_low_limit_for_spectrum = 71
    fft_size_for_spectrum = 2 ** np.ceil(np.log2(3 * fs / f0_low_limit_for_spectrum + 1.))
    # NOTE(review): this unconditionally overwrites the `threshold`
    # parameter, so the "default" argument is never honored — confirm intent.
    threshold = 0.85
    upper_limit = 15000
    frequency_interval = 3000
    f0 = f0_h.copy()
    temporal_positions = temporal_positions_h.copy()
    # Zero out F0 for frames marked unvoiced.
    f0[vuv_h == 0] = 0.

    number_of_aperiodicities = int(np.floor(np.min([upper_limit, fs / 2.]) / float(frequency_interval)))
    window_length = np.floor(frequency_interval / (fs / float(fft_size))) * 2 + 1
    window1 = harvest_nuttall(window_length)
    aperiodicity = np.zeros((int(fft_size_for_spectrum / 2) + 1, len(f0)))
    ap_debug = np.zeros((number_of_aperiodicities, len(f0)))

    frequency_axis = np.arange(int(fft_size_for_spectrum / 2) + 1) * float(fs) / fft_size_for_spectrum
    coarse_axis = np.arange(number_of_aperiodicities + 1) * frequency_interval
    coarse_axis[-1] = fs / 2.

    for i in range(len(f0)):
        r = d4c_love_train(x, fs, f0[i], temporal_positions_h[i], threshold)
        if r == 0:
            # Unvoiced frame: near-total aperiodicity.
            aperiodicity[:, i] = 1 - 0.000000000001
            continue
        current_f0 = max([f0_low_limit, f0[i]])
        # WIP: debug stop — the remainder of the D4C slice estimate is not
        # implemented yet in this revision.
        from IPython import embed; embed()
        raise ValueError()


    import matplotlib.pyplot as plt

    def run_world_example():
    @@ -2995,6 +3072,7 @@ def run_world_example():
    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs = cheaptrick(d, fs,
    temporal_positions_h, f0_h, vuv_h)
    s = d4c(d, fs, temporal_positions_h, f0_h, vuv_h)
    from IPython import embed; embed()
    raise ValueError()

  17. kastnerkyle revised this gist Feb 13, 2017. 1 changed file with 111 additions and 4 deletions.
    115 changes: 111 additions & 4 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2845,6 +2845,12 @@ def harvest_smooth_f0_contour(f0):
    smoothed_f0 = smoothed_f0[300:-300]
    return smoothed_f0

    from tempfile import mkdtemp
    cachedir = "~/joblib_cache"
    from sklearn.externals.joblib import Memory
    memory = Memory(cachedir=cachedir, verbose=0)

    @memory.cache
    def harvest(x, fs):
    f0_floor = 71
    f0_ceil = 800
    @@ -2874,7 +2880,6 @@ def harvest(x, fs):

    connected_f0, vuv = harvest_fix_f0_contour(f0_candidates, f0_scores)
    smoothed_f0 = harvest_smooth_f0_contour(connected_f0)
    import matplotlib.pyplot as plt
    temporal_positions = np.arange(0,
    len(x) / float(fs),
    frame_period / float(1000))
    @@ -2884,12 +2889,114 @@ def harvest(x, fs):
    f0_candidates = f0_candidates
    return temporal_positions, f0, vuv, f0_candidates

def cheaptrick_get_windowed_waveform(x, fs, current_f0, current_position):
    # Extract 1.5 pitch periods on each side of `current_position` (seconds),
    # apply an L2-normalized Hanning window, and remove the window-weighted
    # DC component (CheapTrick front end).
    half_window_length = np.round(1.5 * fs / float(current_f0))
    base_index = np.arange(-half_window_length, half_window_length + 1)
    # +0.001 / +1 / -1 mirrors the 1-indexed MATLAB rounding exactly.
    index = np.round(current_position * fs + 0.001) + base_index + 1
    # Clamp to valid sample range (edge samples repeat near boundaries).
    safe_index = np.minimum(len(x), np.maximum(1, np.round(index))).astype("int32")
    safe_index = safe_index - 1
    segment = x[safe_index]
    time_axis = base_index / float(fs) / 1.5
    window1 = 0.5 * np.cos(np.pi * time_axis * float(current_f0)) + 0.5
    # Normalize window energy to 1.
    window1 = window1 / np.sqrt(np.sum(window1 ** 2))
    # Subtract the window-weighted mean so the windowed segment is zero-mean.
    waveform = segment * window1 - window1 * np.mean(segment * window1) / np.mean(window1)
    return waveform

def cheaptrick_get_power_spectrum(waveform, fs, fft_size, f0):
    # Power spectrum of the windowed frame, with the region below F0
    # symmetrized (mirrored about F0 and summed) to stabilize the estimate,
    # and the upper half mirrored from the lower half (real input).
    power_spectrum = np.abs(np.fft.fft(waveform, fft_size)) ** 2
    frequency_axis = np.arange(fft_size) / float(fft_size) * float(fs)
    # Bins up to one bin above F0.
    ind = frequency_axis < (f0 + fs / fft_size)
    low_frequency_axis = frequency_axis[ind]
    # Sample the spectrum at the frequencies mirrored about F0.
    low_frequency_replica = interp1d(f0 - low_frequency_axis,
                                     power_spectrum[ind], kind="linear",
                                     fill_value="extrapolate")(low_frequency_axis)
    p1 = low_frequency_replica[(frequency_axis < f0)[:len(low_frequency_replica)]]
    p2 = power_spectrum[(frequency_axis < f0)[:len(power_spectrum)]]
    # Add mirrored energy back into the sub-F0 region.
    power_spectrum[frequency_axis < f0] = p1 + p2
    # Mirror lower half onto upper half (Hermitian symmetry of |FFT|^2).
    lb1 = int(fft_size / 2) + 1
    lb2 = 1
    ub2 = int(fft_size / 2)
    power_spectrum[lb1:] = power_spectrum[lb2:ub2][::-1]
    return power_spectrum

def cheaptrick_linear_smoothing(power_spectrum, f0, fs, fft_size):
    # Rectangular smoothing of the power spectrum over a 2*F0/3-wide band,
    # implemented as a difference of the cumulative integral sampled at
    # center +/- F0/3 (one pass, no explicit convolution).
    double_frequency_axis = np.arange(2 * fft_size) / float(fft_size) * fs - fs
    double_spectrum = np.concatenate([power_spectrum, power_spectrum])

    # Running integral of the (periodically extended) spectrum.
    double_segment = np.cumsum(double_spectrum * (fs / float(fft_size)))
    center_frequency = np.arange(int(fft_size / 2) + 1) / float(fft_size) * fs
    low_levels = cheaptrick_interp1h(double_frequency_axis + fs / float(fft_size) / 2.,
                                     double_segment, center_frequency - f0 / 3.)
    high_levels = cheaptrick_interp1h(double_frequency_axis + fs / float(fft_size) / 2.,
                                      double_segment, center_frequency + f0 / 3.)
    # (integral over band) / (band width) — band width is 2*F0/3.
    smoothed_spectrum = (high_levels - low_levels) * 1.5 / f0
    return smoothed_spectrum

    def cheaptrick_interp1h(x, y, xi):
    delta_x = float(x[1] - x[0])
    xi = np.maximum(x[0], np.minimum(x[-1], xi))
    xi_base = (np.floor((xi - x[0]) / delta_x)).astype("int32")
    xi_fraction = (xi - x[0]) / delta_x - xi_base
    delta_y = np.zeros_like(y)
    delta_y[:-1] = y[1:] - y[:-1]
    yi = y[xi_base] + delta_y[xi_base] * xi_fraction
    return yi

def cheaptrick_smoothing_with_recovery(smoothed_spectrum, f0, fs, fft_size, q1):
    # Cepstral liftering stage of CheapTrick: smooth the log spectrum with a
    # sinc lifter and compensate over-smoothing with the q1 lifter; returns
    # the spectral envelope for bins [0, fft_size/2].
    quefrency_axis = np.arange(fft_size) / float(fs)
    # sin(0)/0 makes element 0 NaN here; it is overwritten with 1 below.
    smoothing_lifter = np.sin(np.pi * f0 * quefrency_axis) / (np.pi * f0 * quefrency_axis)
    # Mirror the lifter to make it symmetric (real even sequence).
    p = smoothing_lifter[1:int(fft_size / 2)][::-1].copy()
    smoothing_lifter[int(fft_size / 2) + 1:] = p
    smoothing_lifter[0] = 1.
    compensation_lifter = (1 - 2. * q1) + 2. * q1 * np.cos(2 * np.pi * quefrency_axis * f0)
    p = compensation_lifter[1:int(fft_size / 2)][::-1].copy()
    compensation_lifter[int(fft_size / 2) + 1:] = p
    # Lifter in the cepstral domain, then back to a power envelope.
    tandem_cepstrum = np.fft.fft(np.log(smoothed_spectrum))
    tmp_spectral_envelope = np.exp(np.real(np.fft.ifft(tandem_cepstrum * smoothing_lifter * compensation_lifter)))
    spectral_envelope = tmp_spectral_envelope[:int(fft_size / 2) + 1]
    return spectral_envelope

def cheaptrick_estimate_one_slice(x, fs, current_f0,
                                  current_position, fft_size, q1):
    # Full CheapTrick pipeline for one analysis frame: window extraction,
    # power spectrum, linear smoothing, then lifter-based recovery.
    waveform = cheaptrick_get_windowed_waveform(x, fs, current_f0,
                                                current_position)
    power_spectrum = cheaptrick_get_power_spectrum(waveform, fs, fft_size,
                                                   current_f0)
    smoothed_spectrum = cheaptrick_linear_smoothing(power_spectrum, current_f0,
                                                    fs, fft_size)
    # Rebuild a full-length symmetric spectrum from the half spectrum.
    comb_spectrum = np.concatenate([smoothed_spectrum, smoothed_spectrum[1:-1][::-1]])
    spectral_envelope = cheaptrick_smoothing_with_recovery(comb_spectrum,
                                                           current_f0, fs, fft_size, q1)
    return spectral_envelope

    def cheaptrick(x, fs, temporal_positions, f0_sequence,
    vuv, fft_size="auto", q1=-0.15):
    f0_sequence = f0_sequence.copy()
    f0_low_limit = 71
    default_f0 = 500
    if fft_size == "auto":
    fft_size = int(2 ** np.ceil(np.log2(3. * float(fs) / f0_low_limit + 1)))
    f0_low_limit = fs * 3.0 / (fft_size - 3.0)
    f0_sequence[vuv == 0] = default_f0
    spectrogram = np.zeros((int(fft_size / 2.) + 1, len(f0_sequence)))
    for i in range(len(f0_sequence)):
    if f0_sequence[i] < f0_low_limit:
    f0_sequence[i] = default_d0
    spectrogram[:, i] = cheaptrick_estimate_one_slice(x, fs, f0_sequence[i],
    temporal_positions[i], fft_size, q1)
    return temporal_positions, spectrogram, fs

    import matplotlib.pyplot as plt

def run_world_example():
    # Ad-hoc driver: load a test wav, run Harvest F0 analysis and CheapTrick
    # envelope estimation, then drop into an IPython shell for inspection.
    # Debug/WIP code — always ends in `raise ValueError()`.
    fs, d = wavfile.read("vaiueo2d.wav")
    # Scale int16 PCM to [-1, 1).
    d = d.astype("float32") / 2 ** 15
    # NOTE(review): harvest is run twice with identical arguments; the first
    # result is unused (leftover from an earlier revision).
    temporal_positions, f0, vuv, f0_candidates = harvest(d, fs)


    temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
    temporal_positions_ct, spectrogram_ct, fs = cheaptrick(d, fs,
                                                           temporal_positions_h, f0_h, vuv_h)
    from IPython import embed; embed()
    raise ValueError()

    if __name__ == "__main__":
    run_world_example()
  18. kastnerkyle revised this gist Feb 13, 2017. 1 changed file with 180 additions and 12 deletions.
    192 changes: 180 additions & 12 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2662,29 +2662,188 @@ def harvest_fix_step_1(f0_base, allowed_range):
    def harvest_fix_step_2(f0_step1, voice_range_minimum):
    f0_step2 = f0_step1.copy()
    boundary_list = harvest_get_boundary_list(f0_step1)
    from IPython import embed; embed()
    raise ValueError()

    for i in range(1, int(len(boundary_list) / 2.) + 1):
    distance = boundary_list[(2 * i) - 1] - boundary_list[(2 * i) - 2]
    if distance < voice_range_minimum:
    # subtleties of numpy assignment?
    # need one more due to range not including last index
    lb = boundary_list[(2 * i) - 2]
    ub = boundary_list[(2 * i) - 1] + 1
    f0_step2[lb:ub] = 0.
    return f0_step2

def harvest_fix_step_3(f0_step2, f0_candidates, allowed_range, f0_scores):
    # Harvest contour fix, step 3: extend each voiced segment forward and
    # backward through the candidate grid, keep the extensions that are long
    # enough relative to their mean F0, and merge them into one contour.
    f0_step3 = f0_step2.copy()
    boundary_list = harvest_get_boundary_list(f0_step2)
    multichannel_f0 = harvest_get_multichannel_f0(f0_step2, boundary_list)
    rrange = np.zeros((int(len(boundary_list) / 2), 2))
    threshold1 = 100
    threshold2 = 2200
    count = 0
    for i in range(1, int(len(boundary_list) / 2) + 1):
        # Extend forward (shift=+1) from the segment end...
        extended_f0, tmp_range_1 = harvest_extend_f0(multichannel_f0[i - 1, :],
            boundary_list[(2 * i) - 1],
            min([len(f0_step2) - 1, boundary_list[(2 * i) - 1] + threshold1]),
            1, f0_candidates, allowed_range)
        # ...then backward (shift=-1) from the segment start.
        tmp_f0_sequence, tmp_range_0 = harvest_extend_f0(extended_f0,
            boundary_list[(2 * i) - 2],
            max([2, boundary_list[(2 * i) - 2] - threshold1]), -1,
            f0_candidates, allowed_range)

        mean_f0 = np.mean(tmp_f0_sequence[tmp_range_0 : tmp_range_1 + 1])
        # Keep segments longer than ~threshold2/mean_f0 frames.
        if threshold2 / mean_f0 < (tmp_range_1 - tmp_range_0):
            multichannel_f0[count, :] = tmp_f0_sequence
            rrange[count, :] = np.array([tmp_range_0, tmp_range_1])
            count = count + 1
    if count > 0:
        multichannel_f0 = multichannel_f0[:count, :]
        rrange = rrange[:count, :]
        f0_step3 = harvest_merge_f0(multichannel_f0, rrange, f0_candidates,
                                    f0_scores)
    return f0_step3

def harvest_merge_f0(multichannel_f0, rrange, f0_candidates, f0_scores):
    # Merge per-segment F0 channels (sorted by start index) into one contour,
    # resolving overlaps via candidate scores in harvest_merge_f0_sub.
    # NOTE(review): `rrange` is a float array here and its values are used as
    # slice indices below; modern NumPy rejects float indices — confirm the
    # caller casts, or this needs int conversion.
    number_of_channels = len(multichannel_f0)
    sorted_order = np.argsort(rrange[:, 0])
    f0 = multichannel_f0[sorted_order[0], :]
    for i in range(1, number_of_channels):
        if rrange[sorted_order[i], 0] - rrange[sorted_order[0], 1] > 0:
            # no overlapping
            f0[rrange[sorted_order[i], 0]:rrange[sorted_order[i], 1]] = multichannel_f0[sorted_order[i], rrange[sorted_order[i], 0]:rrange[sorted_order[i], 1]]
            cp = rrange.copy()
            rrange[sorted_order[0], 0] = cp[sorted_order[i], 0]
            rrange[sorted_order[0], 1] = cp[sorted_order[i], 1]
        else:
            # Overlapping segments: pick per-frame values by score.
            cp = rrange.copy()
            res = harvest_merge_f0_sub(f0, cp[sorted_order[0], 0],
                                       cp[sorted_order[0], 1],
                                       multichannel_f0[sorted_order[i], :],
                                       cp[sorted_order[i], 0],
                                       cp[sorted_order[i], 1], f0_candidates, f0_scores)
            f0 = res[0]
            rrange[sorted_order[0], 1] = res[1]
    return f0

def harvest_merge_f0_sub(f0_1, st1, ed1, f0_2, st2, ed2, f0_candidates,
                         f0_scores):
    # Merge two overlapping F0 segments [st1, ed1] and [st2, ed2]: whichever
    # segment scores higher over the overlap keeps its values there.
    # Returns (merged contour, new end index). Mutates f0_1 in place.
    merged_f0 = f0_1
    if (st1 <= st2) and (ed1 >= ed2):
        # Segment 2 lies entirely inside segment 1: nothing to do.
        new_ed = ed1
        return merged_f0, new_ed
    new_ed = ed2

    # Accumulate candidate scores over the overlap [st2, ed1].
    score1 = 0.
    score2 = 0.
    for i in range(int(st2), int(ed1) + 1):
        score1 = score1 + harvest_serach_score(f0_1[i], f0_candidates[:, i], f0_scores[:, i])
        score2 = score2 + harvest_serach_score(f0_2[i], f0_candidates[:, i], f0_scores[:, i])
    if score1 > score2:
        # Keep segment 1 over the overlap; take segment 2 after it.
        merged_f0[int(ed1):int(ed2) + 1] = f0_2[int(ed1):int(ed2) + 1]
    else:
        # Segment 2 wins the whole overlap.
        merged_f0[int(st2):int(ed2) + 1] = f0_2[int(st2):int(ed2) + 1]
    return merged_f0, new_ed

    def harvest_serach_score(f0, f0_candidates, f0_scores):
    score = 0
    for i in range(len(f0_candidates)):
    if (f0 == f0_candidates[i]) and (score < f0_scores[i]):
    score = f0_scores[i]
    return score

def harvest_extend_f0(f0, origin, last_point, shift, f0_candidates,
                      allowed_range):
    # Extend an F0 contour from `origin` toward `last_point`, stepping by
    # `shift` (+1 forward / -1 backward), picking at each frame the candidate
    # closest to the running F0 (within `allowed_range` relative error).
    # Stops after `threshold` consecutive frames with no acceptable
    # candidate. Returns (extended contour, index of last filled frame).
    threshold = 4
    extended_f0 = f0.copy()
    tmp_f0 = extended_f0[origin]
    shifted_origin = origin
    count = 0
    for i in np.arange(origin, last_point + shift, shift):
        bf0, bs = harvest_select_best_f0(tmp_f0,
                                         f0_candidates[:, i + shift], allowed_range)
        extended_f0[i + shift] = bf0
        if extended_f0[i + shift] != 0:
            # Found a plausible continuation: track it and reset the miss count.
            tmp_f0 = extended_f0[i + shift]
            count = 0
            shifted_origin = i + shift
        else:
            count = count + 1
        if count == threshold:
            break
    return extended_f0, shifted_origin

def harvest_get_multichannel_f0(f0, boundary_list):
    # Split a single F0 contour into one row per voiced segment; each row is
    # zero outside its [start, end] pair from boundary_list.
    multichannel_f0 = np.zeros((int(len(boundary_list) / 2), len(f0)))
    for i in range(1, int(len(boundary_list) / 2) + 1):
        # Pairs are laid out as [start_1, end_1, start_2, end_2, ...].
        sl = boundary_list[(2 * i) - 2]
        el = boundary_list[(2 * i) - 1] + 1
        multichannel_f0[i - 1, sl:el] = f0[sl:el]
    return multichannel_f0

    def harvest_get_boundary_list(f0):
    vuv = f0.copy()
    vuv[vuv != 0] = 1.
    vuv[0] = 0
    vuv[-1] = 0
    diff_vuv = vuv[1:] - vuv[:-1]
    boundary_list = np.where(vuv != 0)[0]
    boundary_list = np.where(diff_vuv != 0)[0]
    boundary_list[::2] = boundary_list[::2] + 1
    return boundary_list

def harvest_fix_step_4(f0_step3, threshold):
    # Harvest contour fix, step 4: linearly interpolate across unvoiced gaps
    # shorter than `threshold` frames between adjacent voiced segments.
    f0_step4 = f0_step3.copy()
    boundary_list = harvest_get_boundary_list(f0_step3)

    for i in range(1, int(len(boundary_list) / 2.)):
        # Gap between end of segment i and start of segment i+1.
        distance = boundary_list[(2 * i)] - boundary_list[(2 * i) - 1] - 1
        if distance >= threshold:
            continue
        # NOTE(review): the MATLAB source indexes f0 at boundary+1 here;
        # this port adds/subtracts 1 to the F0 *value* instead — confirm
        # against the reference implementation.
        boundary0 = f0_step3[boundary_list[(2 * i) - 1]] + 1
        boundary1 = f0_step3[boundary_list[(2 * i)]] - 1
        coefficient = (boundary1 - boundary0) / float((distance + 1))
        count = 1
        st = boundary_list[(2 * i) - 1] + 1
        ed = boundary_list[(2 * i)]
        for j in range(st, ed):
            # Linear ramp from boundary0 toward boundary1 across the gap.
            f0_step4[j] = boundary0 + coefficient * count
            count = count + 1
    return f0_step4

    def harvest_fix_f0_contour(f0_candidates, f0_scores):
    f0_base = harvest_search_f0_base(f0_candidates, f0_scores)
    f0_step1 = harvest_fix_step_1(f0_base, 0.008) # optimized?
    from IPython import embed; embed()
    raise ValueError()
    f0_step2 = harvest_fix_step_2(f0_step1, 6) # optimized?
    from IPython import embed; embed()
    raise ValueError()

    import matplotlib.pyplot as plt
    f0_step3 = harvest_fix_step_3(f0_step2, f0_candidates, 0.18, f0_scores) # optimized?
    f0 = harvest_fix_step_4(f0_step3, 9) # optimized
    vuv = f0.copy()
    vuv[vuv != 0] = 1.
    return f0, vuv

def harvest_filter_f0_contour(f0, st, ed, b, a):
    # Zero-phase low-pass filtering of one voiced segment [st, ed]: pad the
    # edges by extending the boundary values, run the IIR filter forward then
    # backward (filtfilt-style), and zero everything outside the segment.
    smoothed_f0 = f0.copy()
    # Hold boundary values outside the segment to suppress edge transients.
    smoothed_f0[:st] = smoothed_f0[st]
    smoothed_f0[ed + 1:] = smoothed_f0[ed]
    aaa = sg.lfilter(b, a, smoothed_f0)
    bbb = sg.lfilter(b, a, aaa[::-1])
    smoothed_f0 = bbb[::-1].copy()
    smoothed_f0[:st] = 0.
    smoothed_f0[ed + 1:] = 0.
    return smoothed_f0

def harvest_smooth_f0_contour(f0):
    # Smooth the fixed F0 contour segment-by-segment with a fixed low-pass
    # biquad (coefficients from the WORLD MATLAB reference), using 300-frame
    # zero padding on both ends so boundary segments filter cleanly.
    b = np.array([0.0078202080334971724, 0.015640416066994345, 0.0078202080334971724])
    a = np.array([1.0, -1.7347257688092754, 0.76600660094326412])
    smoothed_f0 = np.concatenate([np.zeros(300,), f0, np.zeros(300,)])
    boundary_list = harvest_get_boundary_list(smoothed_f0)
    multichannel_f0 = harvest_get_multichannel_f0(smoothed_f0, boundary_list)
    for i in range(1, int(len(boundary_list) / 2) + 1):
        # Filter each voiced segment independently, then splice it back in.
        tmp_f0_contour = harvest_filter_f0_contour(multichannel_f0[i - 1, :],
            boundary_list[(2 * i) - 2], boundary_list[(2 * i) - 1], b, a)
        st = boundary_list[(2 * i) - 2]
        ed = boundary_list[(2 * i) - 1] + 1
        smoothed_f0[st:ed] = tmp_f0_contour[st:ed]
    # Drop the padding.
    smoothed_f0 = smoothed_f0[300:-300]
    return smoothed_f0

    def harvest(x, fs):
    f0_floor = 71
    @@ -2714,13 +2873,22 @@ def harvest(x, fs):
    f0_candidates, f0_scores = harvest_remove_unreliable_candidates(f0_candidates, f0_scores)

    connected_f0, vuv = harvest_fix_f0_contour(f0_candidates, f0_scores)
    from IPython import embed; embed()
    raise ValueError()
    smoothed_f0 = harvest_smooth_f0_contour(connected_f0)
    import matplotlib.pyplot as plt
    temporal_positions = np.arange(0,
    len(x) / float(fs),
    frame_period / float(1000))
    idx = np.minimum(len(smoothed_f0) - 1, np.round(temporal_positions * 1000)).astype("int32")
    f0 = smoothed_f0[idx]
    vuv = vuv[idx]
    f0_candidates = f0_candidates
    return temporal_positions, f0, vuv, f0_candidates

    def run_world_example():
    fs, d = wavfile.read("vaiueo2d.wav")
    d = d.astype("float32") / 2 ** 15
    harvest(d, fs)
    temporal_positions, f0, vuv, f0_candidates = harvest(d, fs)



    if __name__ == "__main__":
  19. kastnerkyle revised this gist Feb 12, 2017. 1 changed file with 78 additions and 2 deletions.
    80 changes: 78 additions & 2 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2581,7 +2581,7 @@ def harvest_get_refined_f0(x, fs, current_time, current_f0, f0_floor,
    ext = np.zeros((len(main_window) + 2))
    ext[1:-1] = main_window
    diff_window = -((ext[1:-1] - ext[:-2]) + (ext[2:] - ext[1:-1])) / float(2)
    safe_index = np.maximum(0, np.minimum(len(x) - 1, base_index)).astype("int32")
    safe_index = np.maximum(1, np.minimum(len(x), base_index)).astype("int32") - 1
    spectrum = np.fft.fft(x[safe_index] * main_window, fft_size)
    diff_spectrum = np.fft.fft(x[safe_index] * diff_window, fft_size)
    numerator_i = np.real(spectrum) * np.imag(diff_spectrum) - np.imag(spectrum) * np.real(diff_spectrum)
    @@ -2594,7 +2594,7 @@ def harvest_get_refined_f0(x, fs, current_time, current_f0, f0_floor,
    instantaneous_frequency_list = instantaneous_frequency[index_list]
    amplitude_list = np.sqrt(power_spectrum[index_list])
    refined_f0 = np.sum(amplitude_list * instantaneous_frequency_list)
    refined_f0 /= np.sum(amplitude_list * harmonics_index)
    refined_f0 /= np.sum(amplitude_list * harmonics_index.astype("float32"))

    variation = np.abs(((instantaneous_frequency_list / harmonics_index.astype("float32")) - current_f0) / float(current_f0))
    refined_score = 1. / (0.000000000001 + np.mean(variation))
    @@ -2604,8 +2604,83 @@ def harvest_get_refined_f0(x, fs, current_time, current_f0, f0_floor,
    redined_score = 0.
    return refined_f0, refined_score

    def harvest_select_best_f0(reference_f0, f0_candidates, allowed_range):
    best_f0 = 0
    best_error = allowed_range

    for i in range(len(f0_candidates)):
    tmp = np.abs(reference_f0 - f0_candidates[i]) / reference_f0
    if tmp > best_error:
    continue
    best_f0 = f0_candidates[i]
    best_error = tmp
    return best_f0, best_error

def harvest_remove_unreliable_candidates(f0_candidates, f0_scores):
    # Zero out candidates (and their scores) that have no close match in
    # either neighboring frame — isolated candidates are unreliable.
    # Shapes: (n_candidates, n_frames) for both inputs.
    new_f0_candidates = f0_candidates.copy()
    new_f0_scores = f0_scores.copy()
    threshold = 0.05
    f0_length = f0_candidates.shape[1]
    number_of_candidates = len(f0_candidates)

    # First and last frames are skipped (no two-sided neighborhood).
    for i in range(1, f0_length - 1):
        for j in range(number_of_candidates):
            reference_f0 = f0_candidates[j, i]
            if reference_f0 == 0:
                continue
            # allowed_range=1 so the best neighbor error is always returned.
            _, min_error1 = harvest_select_best_f0(reference_f0, f0_candidates[:, i + 1], 1)
            _, min_error2 = harvest_select_best_f0(reference_f0, f0_candidates[:, i - 1], 1)
            min_error = min([min_error1, min_error2])
            if min_error > threshold:
                new_f0_candidates[j, i] = 0
                new_f0_scores[j, i] = 0
    return new_f0_candidates, new_f0_scores

    def harvest_search_f0_base(f0_candidates, f0_scores):
    f0_base = f0_candidates[0, :] * 0.
    for i in range(len(f0_base)):
    max_index = np.argmax(f0_scores[:, i])
    f0_base[i] = f0_candidates[max_index, i]
    return f0_base

    def harvest_fix_step_1(f0_base, allowed_range):
    # Step 1: Rapid change of f0 contour is replaced by 0
    f0_step1 = f0_base.copy()
    f0_step1[0] = 0.
    f0_step1[1] = 0.

    for i in range(2, len(f0_base)):
    if f0_base[i] == 0:
    continue
    reference_f0 = f0_base[i - 1] * 2 - f0_base[i - 2]
    c1 = np.abs((f0_base[i] - reference_f0) / reference_f0) > allowed_range
    c2 = np.abs((f0_base[i] - f0_base[i - 1]) / f0_base[i - 1]) > allowed_range
    if c1 and c2:
    f0_step1[i] = 0.
    return f0_step1

def harvest_fix_step_2(f0_step1, voice_range_minimum):
    # WIP stub from this revision: computes the segment boundaries and then
    # drops into a debug shell — the actual short-segment removal is not
    # implemented here yet.
    f0_step2 = f0_step1.copy()
    boundary_list = harvest_get_boundary_list(f0_step1)
    from IPython import embed; embed()
    raise ValueError()

    def harvest_get_boundary_list(f0):
    vuv = f0.copy()
    vuv[vuv != 0] = 1.
    vuv[0] = 0
    vuv[-1] = 0
    diff_vuv = vuv[1:] - vuv[:-1]
    boundary_list = np.where(vuv != 0)[0]
    boundary_list[::2] = boundary_list[::2] + 1
    return boundary_list

def harvest_fix_f0_contour(f0_candidates, f0_scores):
    # WIP stub from this revision: runs step 1 then drops into a debug
    # shell; steps 2-4 and the return value are not implemented here yet.
    f0_base = harvest_search_f0_base(f0_candidates, f0_scores)
    f0_step1 = harvest_fix_step_1(f0_base, 0.008) # optimized?
    from IPython import embed; embed()
    raise ValueError()
    f0_step2 = harvest_fix_step_2(f0_step1, 6) # optimized?
    from IPython import embed; embed()
    raise ValueError()

    @@ -2638,6 +2713,7 @@ def harvest(x, fs):

    f0_candidates, f0_scores = harvest_remove_unreliable_candidates(f0_candidates, f0_scores)

    connected_f0, vuv = harvest_fix_f0_contour(f0_candidates, f0_scores)
    from IPython import embed; embed()
    raise ValueError()

  20. kastnerkyle revised this gist Feb 12, 2017. 1 changed file with 118 additions and 8 deletions.
    126 changes: 118 additions & 8 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2445,7 +2445,7 @@ def harvest_get_f0_candidate_from_raw_event(boundary_f0,
    band_pass_filter = band_pass_filter_base * shifter

    index_bias = filter_length_half
    # numerical issues if 32 bit?
    # possible numerical issues if 32 bit
    spectrum_low_pass_filter = np.fft.fft(band_pass_filter.astype("float64"), len(y_spectrum))
    filtered_signal = np.real(np.fft.ifft(spectrum_low_pass_filter * y_spectrum))
    index_bias = filter_length_half + 1
    @@ -2457,8 +2457,11 @@ def harvest_get_f0_candidate_from_raw_event(boundary_f0,
    dip = harvest_zero_crossing_engine(-d_filtered_signal, fs)
    f0_candidate = harvest_get_f0_candidate_contour(negative_zero_cross,
    positive_zero_cross, peak, dip, temporal_positions)
    from IPython import embed; embed()
    raise ValueError()
    f0_candidate[f0_candidate > (boundary_f0 * 1.1)] = 0.
    f0_candidate[f0_candidate < (boundary_f0 * .9)] = 0.
    f0_candidate[f0_candidate > f0_ceil] = 0.
    f0_candidate[f0_candidate < f0_floor] = 0.
    return f0_candidate

    def harvest_get_f0_candidate_contour(negative_zero_cross_tup, positive_zero_cross_tup, peak_tup, dip_tup, temporal_positions):
    # 0 is inteval locations
    @@ -2486,7 +2489,6 @@ def harvest_get_f0_candidate_contour(negative_zero_cross_tup, positive_zero_cros
    f0_candidate = temporal_positions * 0
    return f0_candidate


    def harvest_zero_crossing_engine(x, fs, debug=False):
    # negative zero crossing, going from positive to negative
    x_shift = x.copy()
    @@ -2502,6 +2504,111 @@ def harvest_zero_crossing_engine(x, fs, debug=False):
    interval_based_f0 = float(fs) / (fine_edge_list[1:] - fine_edge_list[:-1])
    return interval_locations, interval_based_f0

def harvest_detect_official_f0_candidates(raw_f0_candidates):
    # Collapse per-channel raw candidates into "official" per-frame
    # candidates: within each frame, runs of >= `threshold` consecutive
    # nonzero channels are averaged into one candidate.
    # Returns (candidates array, max number of candidates in any frame).
    number_of_channels, number_of_frames = raw_f0_candidates.shape
    f0_candidates = np.zeros((int(np.round(number_of_channels / 10.)), number_of_frames))
    number_of_candidates = 0
    threshold = 10
    for i in range(number_of_frames):
        # Binarize the channel column and find run boundaries via the diff.
        tmp = raw_f0_candidates[:, i].copy()
        tmp[tmp > 0] = 1.
        tmp[0] = 0
        tmp[-1] = 0
        tmp = tmp[1:] - tmp[:-1]
        st = np.where(tmp == 1)[0]
        ed = np.where(tmp == -1)[0]
        count = 0
        for j in range(len(st)):
            dif = ed[j] - st[j]
            if dif >= threshold:
                # Long enough run: its mean F0 becomes one candidate.
                tmp_f0 = raw_f0_candidates[st[j] + 1: ed[j] + 1, i]
                f0_candidates[count, i] = np.mean(tmp_f0)
                count = count + 1
        number_of_candidates = max(number_of_candidates, count)
    return f0_candidates, number_of_candidates

def harvest_overlap_f0_candidates(f0_candidates, max_number_of_f0_candidates):
    # Augment each frame's candidate set with the candidates of its +/- n
    # neighboring frames (time-shifted copies stacked as extra rows). Port of
    # the MATLAB OverlapF0Candidates; the indexing below reproduces MATLAB's
    # implicit out-of-bounds growth and is fragile — see inline notes.
    n = 3 # this is the optimized parameter... apparently
    number_of_candidates = n * 2 + 1
    # NOTE(review): seeding from row `number_of_candidates` (row 7) looks
    # arbitrary — it is only used to get a correctly-shaped row before the
    # vstack below; confirm against the MATLAB reference.
    new_f0_candidates = f0_candidates[number_of_candidates, :].copy()
    new_f0_candidates = new_f0_candidates[None]
    # hack to bypass magic matlab-isms of allocating when indexing OOB
    new_f0_candidates = np.vstack([new_f0_candidates] + (new_f0_candidates.shape[-1] - 1) * [np.zeros_like(new_f0_candidates)])
    # this indexing is megagross, possible source for bugs!
    all_nonzero = []
    for i in range(number_of_candidates):
        # st/ed encode the shift direction; f2/f3/f4 translate them into
        # python slice endpoints (None == "to the end").
        st = max(-(i - n), 0)
        ed = min(-(i - n), 0)
        f1_b = np.arange(max_number_of_f0_candidates).astype("int32")
        f1 = f1_b + int(i * max_number_of_f0_candidates)
        all_nonzero = list(set(all_nonzero + list(f1)))
        f2 = None if ed == 0 else ed
        f3 = -ed
        f4 = None if st == 0 else -st
        # Copy the candidate rows shifted by (i - n) frames.
        new_f0_candidates[f1, st:f2] = f0_candidates[f1_b, f3:f4]
    # Keep only the rows that were actually written.
    new_f0_candidates = new_f0_candidates[all_nonzero, :]
    return new_f0_candidates

def harvest_refine_candidates(x, fs, temporal_positions, f0_candidates,
                              f0_floor, f0_ceil):
    # Refine every nonzero F0 candidate with instantaneous-frequency
    # analysis (harvest_get_refined_f0) and collect a reliability score per
    # candidate. Returns (refined candidates, scores), both shaped like
    # f0_candidates.
    new_f0_candidates = f0_candidates.copy()
    f0_scores = f0_candidates * 0.
    for i in range(len(temporal_positions)):
        for j in range(len(f0_candidates)):
            tmp_f0 = f0_candidates[j, i]
            if tmp_f0 == 0:
                # No candidate in this slot; leave value and score at 0.
                continue
            res = harvest_get_refined_f0(x, fs, temporal_positions[i],
                                         tmp_f0, f0_floor, f0_ceil)
            new_f0_candidates[j, i] = res[0]
            f0_scores[j, i] = res[1]
    return new_f0_candidates, f0_scores

    def harvest_get_refined_f0(x, fs, current_time, current_f0, f0_floor,
    f0_ceil):
    half_window_length = np.ceil(3. * fs / current_f0 / 2.)
    window_length_in_time = (2. * half_window_length + 1) / float(fs)
    base_time = np.arange(-half_window_length, half_window_length + 1) / float(fs)
    fft_size = int(2 ** np.ceil(np.log2((half_window_length * 2 + 1)) + 1))
    frequency_axis = np.arange(fft_size) / fft_size * float(fs)

    base_index = np.round((current_time + base_time) * fs + 0.001)
    index_time = (base_index - 1) / float(fs)
    window_time = index_time - current_time
    part1 = np.cos(2 * np.pi * window_time / window_length_in_time)
    part2 = np.cos(4 * np.pi * window_time / window_length_in_time)
    main_window = 0.42 + 0.5 * part1 + 0.08 * part2
    ext = np.zeros((len(main_window) + 2))
    ext[1:-1] = main_window
    diff_window = -((ext[1:-1] - ext[:-2]) + (ext[2:] - ext[1:-1])) / float(2)
    safe_index = np.maximum(0, np.minimum(len(x) - 1, base_index)).astype("int32")
    spectrum = np.fft.fft(x[safe_index] * main_window, fft_size)
    diff_spectrum = np.fft.fft(x[safe_index] * diff_window, fft_size)
    numerator_i = np.real(spectrum) * np.imag(diff_spectrum) - np.imag(spectrum) * np.real(diff_spectrum)
    power_spectrum = np.abs(spectrum) ** 2
    instantaneous_frequency = frequency_axis + numerator_i / power_spectrum * float(fs) / 2. / np.pi

    number_of_harmonics = int(min(np.floor(float(fs) / 2. / current_f0), 6.))
    harmonics_index = np.arange(number_of_harmonics) + 1
    index_list = np.round(current_f0 * fft_size / fs * harmonics_index).astype("int32")
    instantaneous_frequency_list = instantaneous_frequency[index_list]
    amplitude_list = np.sqrt(power_spectrum[index_list])
    refined_f0 = np.sum(amplitude_list * instantaneous_frequency_list)
    refined_f0 /= np.sum(amplitude_list * harmonics_index)

    variation = np.abs(((instantaneous_frequency_list / harmonics_index.astype("float32")) - current_f0) / float(current_f0))
    refined_score = 1. / (0.000000000001 + np.mean(variation))

    if (refined_f0 < f0_floor) or (refined_f0 > f0_ceil) or (refined_score < 2.5):
    refined_f0 = 0.
    redined_score = 0.
    return refined_f0, refined_score


def harvest_remove_unreliable_candidates(f0_candidates, f0_scores):
    # WIP stub from this revision: immediately drops into a debug shell —
    # the neighbor-consistency filtering is not implemented here yet.
    from IPython import embed; embed()
    raise ValueError()

    import matplotlib.pyplot as plt

    def harvest(x, fs):
    @@ -2524,13 +2631,16 @@ def harvest(x, fs):
    boundary_f0_list, len(y), basic_temporal_positions, actual_fs,
    y_spectrum, f0_floor, f0_ceil)

    # raw_f0_candidates = GetRawF0Candidates(...
    # length(basic_temporal_positions), boundary_f0_list, length(y),...
    # basic_temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil);
    f0_candidates, number_of_candidates = harvest_detect_official_f0_candidates(raw_f0_candidates)
    f0_candidates = harvest_overlap_f0_candidates(f0_candidates, number_of_candidates)
    f0_candidates, f0_scores = harvest_refine_candidates(y, actual_fs,
    basic_temporal_positions, f0_candidates, f0_floor, f0_ceil)

    f0_candidates, f0_scores = harvest_remove_unreliable_candidates(f0_candidates, f0_scores)

    from IPython import embed; embed()
    raise ValueError()


    def run_world_example():
    fs, d = wavfile.read("vaiueo2d.wav")
    d = d.astype("float32") / 2 ** 15
  21. kastnerkyle revised this gist Feb 12, 2017. 1 changed file with 142 additions and 2 deletions.
    144 changes: 142 additions & 2 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -14,6 +14,7 @@
    """
    from numpy.lib.stride_tricks import as_strided
    import scipy.signal as sg
    from scipy.interpolate import interp1d
    import wave
    from scipy.cluster.vq import vq
    from scipy import linalg, fftpack
    @@ -626,6 +627,13 @@ def rolling_mean(X, window_size):
    return np.correlate(X, w, 'valid')


    def rolling_window(X, window_size):
    # for 1d data
    shape = X.shape[:-1] + (X.shape[-1] - window_size + 1, window_size)
    strides = X.strides + (X.strides[-1],)
    return np.lib.stride_tricks.as_strided(X, shape=shape, strides=strides)


    def voiced_unvoiced(X, window_size=256, window_step=128, copy=True):
    """
    Voiced unvoiced detection from a raw signal
    @@ -2395,14 +2403,146 @@ def run_fft_dct_example():
    wavfile.write("fftdct_orig.wav", fs, soundsc(X))
    wavfile.write("fftdct_rec.wav", fs, soundsc(X_r))

def harvest_get_downsampled_signal(x, fs, target_fs):
    # Decimate `x` to roughly `target_fs`, padding the edges with the first/
    # last sample (a multiple of the decimation ratio wide) so filter
    # transients fall in the padding, then remove the pad and the DC offset.
    # Returns (downsampled signal, actual sample rate after decimation).
    decimation_ratio = np.round(fs / target_fs)
    # Pad width: at least 140 samples, rounded up to a whole ratio multiple.
    offset = np.ceil(140. / decimation_ratio) * decimation_ratio
    start_pad = x[0] * np.ones(int(offset), dtype=np.float32)
    end_pad = x[-1] * np.ones(int(offset), dtype=np.float32)
    x = np.concatenate((start_pad, x, end_pad), axis=0)

    if fs < target_fs:
        # Upsampling is not implemented in this port.
        raise ValueError("CASE NOT HANDLED IN harvest_get_downsampled_signal")
    else:
        # 3rd-order zero-phase decimation filter.
        y0 = sg.decimate(x, int(decimation_ratio), 3, zero_phase=True)
        actual_fs = fs / decimation_ratio
        y = y0[int(offset / decimation_ratio):-int(offset / decimation_ratio)]
    # Remove DC.
    y = y - np.mean(y)
    return y, actual_fs

def harvest_get_raw_f0_candidates(number_of_frames, boundary_f0_list,
                                  y_length, temporal_positions, actual_fs, y_spectrum, f0_floor,
                                  f0_ceil):
    # One raw F0-candidate track per boundary frequency: each channel
    # band-passes the signal spectrum around its boundary F0 and extracts a
    # candidate contour. Returns (n_channels, n_frames) float32.
    raw_f0_candidates = np.zeros((len(boundary_f0_list), number_of_frames), dtype=np.float32)
    for i in range(len(boundary_f0_list)):
        raw_f0_candidates[i, :] = harvest_get_f0_candidate_from_raw_event(
            boundary_f0_list[i], actual_fs, y_spectrum, y_length,
            temporal_positions, f0_floor, f0_ceil)
    return raw_f0_candidates

    def harvest_nuttall(N):
    t = np.arange(0, N) * 2 * np.pi / (N - 1)
    coefs = np.array([0.355768, -0.487396, 0.144232, -0.012604])
    window = np.cos(t[:, None].dot(np.array([0., 1., 2., 3.])[None])).dot( coefs[:, None])
    # 1D window...
    return window.ravel()

    def harvest_get_f0_candidate_from_raw_event(boundary_f0,
    fs, y_spectrum, y_length, temporal_positions, f0_floor,
    f0_ceil):
    filter_length_half = int(np.round(fs / boundary_f0 * 2))
    band_pass_filter_base = harvest_nuttall(filter_length_half * 2 + 1)
    shifter = np.cos(2 * np.pi * boundary_f0 * np.arange(-filter_length_half, filter_length_half + 1) / float(fs))
    band_pass_filter = band_pass_filter_base * shifter

    index_bias = filter_length_half
    # numerical issues if 32 bit?
    spectrum_low_pass_filter = np.fft.fft(band_pass_filter.astype("float64"), len(y_spectrum))
    filtered_signal = np.real(np.fft.ifft(spectrum_low_pass_filter * y_spectrum))
    index_bias = filter_length_half + 1
    filtered_signal = filtered_signal[index_bias + np.arange(y_length).astype("int32")]
    negative_zero_cross = harvest_zero_crossing_engine(filtered_signal, fs)
    positive_zero_cross = harvest_zero_crossing_engine(-filtered_signal, fs)
    d_filtered_signal = filtered_signal[1:] - filtered_signal[:-1]
    peak = harvest_zero_crossing_engine(d_filtered_signal, fs)
    dip = harvest_zero_crossing_engine(-d_filtered_signal, fs)
    f0_candidate = harvest_get_f0_candidate_contour(negative_zero_cross,
    positive_zero_cross, peak, dip, temporal_positions)
    from IPython import embed; embed()
    raise ValueError()

    def harvest_get_f0_candidate_contour(negative_zero_cross_tup, positive_zero_cross_tup, peak_tup, dip_tup, temporal_positions):
    # 0 is inteval locations
    # 1 is interval based f0
    usable_channel = max(0, len(negative_zero_cross_tup[0]) - 2)
    usable_channel *= max(0, len(positive_zero_cross_tup[0]) - 2)
    usable_channel *= max(0, len(peak_tup[0]) - 2)
    usable_channel *= max(0, len(dip_tup[0]) - 2)
    if usable_channel > 0:
    interpolated_f0_list = np.zeros((4, len(temporal_positions)))
    nz = interp1d(negative_zero_cross_tup[0], negative_zero_cross_tup[1],
    kind="linear", bounds_error=False, fill_value="extrapolate")
    pz = interp1d(positive_zero_cross_tup[0], positive_zero_cross_tup[1],
    kind="linear", bounds_error=False, fill_value="extrapolate")
    pkz = interp1d(peak_tup[0], peak_tup[1],
    kind="linear", bounds_error=False, fill_value="extrapolate")
    dz = interp1d(dip_tup[0], dip_tup[1],
    kind="linear", bounds_error=False, fill_value="extrapolate")
    interpolated_f0_list[0, :] = nz(temporal_positions)
    interpolated_f0_list[1, :] = pz(temporal_positions)
    interpolated_f0_list[2, :] = pkz(temporal_positions)
    interpolated_f0_list[3, :] = dz(temporal_positions)
    f0_candidate = np.mean(interpolated_f0_list, axis=0)
    else:
    f0_candidate = temporal_positions * 0
    return f0_candidate


    def harvest_zero_crossing_engine(x, fs, debug=False):
    # negative zero crossing, going from positive to negative
    x_shift = x.copy()
    x_shift[:-1] = x_shift[1:]
    x_shift[-1] = x[-1]
    # +1 here to avoid edge case at 0
    points = np.arange(len(x)) + 1
    negative_going_points = points * ((x_shift * x < 0) * (x_shift < x))
    edge_list = negative_going_points[negative_going_points > 0]
    # -1 to correct index
    fine_edge_list = edge_list - x[edge_list - 1] / (x[edge_list] - x[edge_list - 1]).astype("float32")
    interval_locations = (fine_edge_list[:-1] + fine_edge_list[1:]) / float(2) / fs
    interval_based_f0 = float(fs) / (fine_edge_list[1:] - fine_edge_list[:-1])
    return interval_locations, interval_based_f0

    import matplotlib.pyplot as plt

def harvest(x, fs):
    """Harvest F0 estimation (in-progress port of M. Morise's MATLAB WORLD).

    Parameters
    ----------
    x : ndarray
        1D input signal
    fs : int
        Sample rate of ``x``

    NOTE(review): this function is unfinished -- it drops into an IPython
    shell and raises unconditionally after computing the raw candidates,
    so it never returns F0 information.
    """
    f0_floor = 71
    f0_ceil = 800
    frame_period = 5
    basic_frame_period = 1
    target_fs = 8000
    # frame centers (seconds) at the 1 ms basic analysis period
    basic_temporal_positions = np.arange(0, len(x) / float(fs), basic_frame_period / float(1000))
    channels_in_octave = 40.
    adjusted_f0_floor = f0_floor * 0.9
    adjusted_f0_ceil = f0_ceil * 1.1
    # log-spaced channel boundary frequencies, 40 per octave over the
    # (slightly widened) search range
    boundary_f0_list = np.arange(1, np.ceil(np.log2(adjusted_f0_ceil / adjusted_f0_floor) * channels_in_octave) + 1) / float(channels_in_octave)
    boundary_f0_list = adjusted_f0_floor * 2.0 ** boundary_f0_list
    y, actual_fs = harvest_get_downsampled_signal(x, fs, target_fs)
    fft_size = 2. ** np.ceil(np.log2(len(y) + np.round(fs / f0_floor * 4) + 1))
    # NOTE(review): fft_size is computed above but 8192 is hardcoded here --
    # presumably a leftover; confirm against the MATLAB reference before use
    y_spectrum = np.fft.fft(y, 8192)
    raw_f0_candidates = harvest_get_raw_f0_candidates(
        len(basic_temporal_positions),
        boundary_f0_list, len(y), basic_temporal_positions, actual_fs,
        y_spectrum, f0_floor, f0_ceil)

    # raw_f0_candidates = GetRawF0Candidates(...
    #     length(basic_temporal_positions), boundary_f0_list, length(y),...
    #     basic_temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil);
    from IPython import embed; embed()
    raise ValueError()


    def run_world_example():
    fs, d = wavfile.read("vaiueo2d.wav")
    d = d.astype("float32") / 2 ** 15
    harvest(d, fs)


    if __name__ == "__main__":
    run_world_example()
    """
    Trying to run all examples will seg fault on my laptop - probably memory!
    Comment individually
    """
    run_phase_reconstruction_example()
    """
    run_phase_vq_example()
    run_dct_vq_example()
    run_fft_vq_example()
  22. kastnerkyle revised this gist Jan 25, 2017. 1 changed file with 52 additions and 36 deletions.
    88 changes: 52 additions & 36 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -1,16 +1,20 @@
    # License: BSD 3-clause
    # Authors: Kyle Kastner

    from __future__ import division
    import numpy as np
    """
    # TODO: Is this fixed or hosed
    # Do numpy version check for <= 1.9 ! Something crazy going on in 1.10
    if int(np.__version__.split(".")[1]) >= 10:
    print("Only numpy <= 1.9 currently supported! There is a "
    "numerical error in one of the numpy 1.10 routines. "
    "Hopefully this will be debugged an corrected soon. "
    "For the intrepid, the error can be seen by running"
    "run_phase_reconstruction()")
    """
    from numpy.lib.stride_tricks import as_strided
    import scipy.signal as sg
    import wave
    from scipy.cluster.vq import vq
    from scipy import linalg, fftpack
    from numpy.testing import assert_almost_equal
    @@ -175,7 +179,7 @@ def stft(X, fftsize=128, step="half", mean_normalize=True, real=False,
    local_fft = fftpack.fft
    cut = None
    if compute_onesided:
    cut = fftsize // 2
    cut = fftsize // 2 + 1
    if mean_normalize:
    X -= X.mean()
    if step == "half":
    @@ -203,8 +207,8 @@ def istft(X, fftsize=128, step="half", wsola=False, mean_normalize=True,
    local_ifft = fftpack.ifft
    if compute_onesided:
    X_pad = np.zeros((X.shape[0], 2 * X.shape[1])) + 0j
    X_pad[:, :fftsize // 2] = X
    X_pad[:, fftsize // 2:] = 0
    X_pad[:, :fftsize // 2 + 1] = X
    X_pad[:, fftsize // 2 + 1:] = 0
    X = X_pad
    X = local_ifft(X).astype("float64")
    if step == "half":
    @@ -763,7 +767,7 @@ def lpc_analysis(X, order=8, window_step=128, window_size=2 * 128,
    X = X[None]

    n_points = X.shape[1]
    n_windows = n_points // window_step
    n_windows = int(n_points // window_step)
    if not truncate:
    pad_sizes = [(window_size - window_step) // 2,
    window_size - window_step // 2]
    @@ -783,20 +787,21 @@ def lpc_analysis(X, order=8, window_step=128, window_size=2 * 128,
    # stride_tricks.as_strided?
    autocorr_X = np.zeros((n_windows, 2 * window_size - 1))
    for window in range(max(n_windows - 1, 1)):
    XX = X.ravel()[window * window_step + np.arange(window_size)]
    wtws = int(window * window_step)
    XX = X.ravel()[wtws + np.arange(window_size, dtype="int32")]
    WXX = XX * sg.hanning(window_size)
    autocorr_X[window] = np.correlate(WXX, WXX, mode='full')
    center = np.argmax(autocorr_X[window])
    RXX = autocorr_X[window,
    np.arange(center, window_size + order)]
    np.arange(center, window_size + order, dtype="int32")]
    R = linalg.toeplitz(RXX[:-1])
    solved_R = linalg.pinv(R).dot(RXX[1:])
    filter_coefs = np.hstack((1, -solved_R))
    residual_signal = sg.lfilter(filter_coefs, 1, WXX)
    gain = np.sqrt(np.mean(residual_signal ** 2))
    lp_coefficients[window] = filter_coefs
    per_frame_gain[window] = gain
    assign_range = window * window_step + np.arange(window_size)
    assign_range = wtws + np.arange(window_size, dtype="int32")
    residual_excitation[assign_range] += residual_signal / gain
    # Throw away first part in overlap mode for proper synthesis
    residual_excitation = residual_excitation[pad_sizes[0]:]
    @@ -1651,7 +1656,7 @@ def time_attack_agc(X, fs, t_scale=0.5, f_scale=1.):
    n_bands = max(10, 20 / f_scale)
    mel_width = f_scale * n_bands / 10.
    f_to_a = mel_freq_weights(n_fft, fs, n_bands, mel_width)
    f_to_a = f_to_a[:, :n_fft // 2]
    f_to_a = f_to_a[:, :n_fft // 2 + 1]
    audiogram = np.abs(X_freq).dot(f_to_a.T)
    fbg = np.zeros_like(audiogram)
    state = np.zeros((audiogram.shape[1],))
    @@ -1926,23 +1931,28 @@ def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False,
    """
    reg = np.max(X_s) / 1E8
    X_best = copy.deepcopy(X_s)
    for i in range(n_iter):
    if verbose:
    print("Runnning iter %i" % i)
    if i == 0 and not complex_input:
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
    set_zero_phase=True)
    else:
    # Calculate offset was False in the MATLAB version
    # but in mine it massively improves the result
    # Possible bug in my impl?
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
    set_zero_phase=False)
    est = stft(X_t, fftsize=fftsize, step=step, compute_onesided=False)
    phase = est / np.maximum(reg, np.abs(est))
    phase = phase[:len(X_s)]
    X_s = X_s[:len(phase)]
    X_best = X_s * phase
    try:
    for i in range(n_iter):
    if verbose:
    print("Runnning iter %i" % i)
    if i == 0 and not complex_input:
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
    set_zero_phase=True)
    else:
    # Calculate offset was False in the MATLAB version
    # but in mine it massively improves the result
    # Possible bug in my impl?
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
    set_zero_phase=False)
    est = stft(X_t, fftsize=fftsize, step=step, compute_onesided=False)
    phase = est / np.maximum(reg, np.abs(est))
    phase = phase[:len(X_s)]
    X_s = X_s[:len(phase)]
    X_best = X_s * phase
    except ValueError:
    raise ValueError("The iterate_invert_spectrogram algorithm requires"
    " stft(..., compute_onesided=False),",
    " be sure you have calculated stft with this argument")
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
    set_zero_phase=False)
    return np.real(X_t)
    @@ -2144,8 +2154,8 @@ def apply_preprocess(list_of_data, clusters):
    vq_r = clusters[memberships]
    vq_r = vq_r.reshape((time_smoothing * len(vq_r),
    vq_r.shape[1] // time_smoothing))
    f_mag = vq_r[:, :n_fft // 2]
    f_sincos = vq_r[:, n_fft // 2:]
    f_mag = vq_r[:, :n_fft // 2 + 1]
    f_sincos = vq_r[:, n_fft // 2 + 1:]
    extent = f_sincos.shape[1] // 2
    f_phs = sin_cos_to_angle(f_sincos[:, :extent], f_sincos[:, extent:])
    vq_c = abs_and_angle_to_complex(f_mag, f_phs)
    @@ -2348,7 +2358,11 @@ def apply_preprocess(list_of_data, clusters):


    def run_cqt_example():
    fs, d = fetch_sample_file("/Users/User/cqt_resources/kempff1.wav")
    try:
    fs, d = fetch_sample_file("/Users/User/cqt_resources/kempff1.wav")
    except ValueError:
    print("WARNING: Using sample music instead but kempff1.wav is the example")
    fs, d = fetch_sample_music()
    X = d[:44100]
    X_cq, c_dc, c_nyq, multiscale, shift, window_lens = cqt(X, fs)
    X_r = icqt(X_cq, c_dc, c_nyq, multiscale, shift, window_lens)
    @@ -2388,10 +2402,12 @@ def run_fft_dct_example():
    Comment individually
    """
    run_phase_reconstruction_example()
    #run_phase_vq_example()
    #run_dct_vq_example()
    #run_fft_vq_example()
    #run_lpc_example()
    #run_cqt_example()
    #run_fft_dct_example()
    #test_all()
    """
    run_phase_vq_example()
    run_dct_vq_example()
    run_fft_vq_example()
    run_lpc_example()
    run_cqt_example()
    run_fft_dct_example()
    test_all()
    """
  23. kastnerkyle revised this gist Jan 25, 2017. 1 changed file with 296 additions and 183 deletions.
    479 changes: 296 additions & 183 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -169,10 +169,10 @@ def stft(X, fftsize=128, step="half", mean_normalize=True, real=False,
    Compute STFT for 1D real valued input X
    """
    if real:
    local_fft = np.fft.rfft
    local_fft = fftpack.rfft
    cut = -1
    else:
    local_fft = np.fft.fft
    local_fft = fftpack.fft
    cut = None
    if compute_onesided:
    cut = fftsize // 2
    @@ -189,25 +189,28 @@ def stft(X, fftsize=128, step="half", mean_normalize=True, real=False,
    return X


    def istft(X, fftsize=128, mean_normalize=True, real=False,
    compute_onesided=True):
    def istft(X, fftsize=128, step="half", wsola=False, mean_normalize=True,
    real=False, compute_onesided=True):
    """
    Compute ISTFT for STFT transformed X
    """
    if real:
    local_ifft = np.fft.irfft
    local_ifft = fftpack.irfft
    X_pad = np.zeros((X.shape[0], X.shape[1] + 1)) + 0j
    X_pad[:, :-1] = X
    X = X_pad
    else:
    local_ifft = np.fft.ifft
    local_ifft = fftpack.ifft
    if compute_onesided:
    X_pad = np.zeros((X.shape[0], 2 * X.shape[1])) + 0j
    X_pad[:, :fftsize // 2] = X
    X_pad[:, fftsize // 2:] = 0
    X = X_pad
    X = local_ifft(X).astype("float64")
    X = invert_halfoverlap(X)
    if step == "half":
    X = invert_halfoverlap(X)
    else:
    X = overlap_add(X, step, wsola=wsola)
    if mean_normalize:
    X -= np.mean(X)
    return X
    @@ -245,17 +248,17 @@ def nsgcwin(fmin, fmax, n_bins, fs, signal_len, gamma):
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    @@ -364,17 +367,17 @@ def nsgtf_real(X, multiscale, shift, window_lens):
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    @@ -435,17 +438,17 @@ def nsdual(multiscale, shift, window_lens):
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    @@ -483,17 +486,17 @@ def nsgitf_real(c, c_dc, c_nyq, multiscale, shift):
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    @@ -546,17 +549,17 @@ def cqt(X, fs, n_bins=48, fmin=27.5, fmax="nyq", gamma=20):
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    @@ -591,17 +594,17 @@ def icqt(X_cq, c_dc, c_nyq, multiscale, shift, window_lens):
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    @@ -678,7 +681,10 @@ def voiced_unvoiced(X, window_size=256, window_step=128, copy=True):
    strength_factor = .3
    start = np.where(right_XX_corr < .3 * XX_corr[center])[0]
    # 20 is hardcoded but should depend on samplerate?
    start = np.max([20, start[0]])
    try:
    start = np.max([20, start[0]])
    except IndexError:
    start = 20
    search_corr = right_XX_corr[start:]
    index = np.argmax(search_corr)
    second_max = search_corr[index]
    @@ -700,7 +706,7 @@ def lpc_analysis(X, order=8, window_step=128, window_size=2 * 128,
    Based on code from:
    http://labrosa.ee.columbia.edu/matlab/sws/
    Parameters
    Parameters
    ----------
    X : ndarray
    Signals to extract LPC coefficients from
    @@ -797,6 +803,112 @@ def lpc_analysis(X, order=8, window_step=128, window_size=2 * 128,
    return lp_coefficients, per_frame_gain, residual_excitation


def lpc_to_frequency(lp_coefficients, per_frame_gain):
    """
    Extract resonant frequencies and magnitudes from LPC coefficients and gains.

    Parameters
    ----------
    lp_coefficients : ndarray
        LPC coefficients, such as those calculated by ``lpc_analysis``

    per_frame_gain : ndarray
        Gain calculated for each frame, such as those calculated
        by ``lpc_analysis``

    Returns
    -------
    frequencies : ndarray
        Resonant frequencies calculated from LPC coefficients and gain. Returned
        frequencies are from 0 to 2 * pi

    magnitudes : ndarray
        Magnitudes of resonant frequencies

    References
    ----------
    D. P. W. Ellis (2004), "Sinewave Speech Analysis/Synthesis in Matlab",
    Web resource, available: http://www.ee.columbia.edu/ln/labrosa/matlab/sws/
    """
    n_windows, order = lp_coefficients.shape

    # at most (order - 1) // 2 conjugate pole pairs contribute resonances
    frame_frequencies = np.zeros((n_windows, (order - 1) // 2))
    frame_magnitudes = np.zeros_like(frame_frequencies)

    for window in range(n_windows):
        w_coefs = lp_coefficients[window]
        g_coefs = per_frame_gain[window]
        # roots of the LPC polynomial are the resonant pole locations
        roots = np.roots(np.hstack(([1], w_coefs[1:])))
        # Roots doesn't return the same thing as MATLAB... agh
        frequencies, index = np.unique(
            np.abs(np.angle(roots)), return_index=True)
        # Make sure 0 doesn't show up...
        gtz = np.where(frequencies > 0)[0]
        frequencies = frequencies[gtz]
        index = index[gtz]
        # magnitude grows as the pole approaches the unit circle
        magnitudes = g_coefs / (1. - np.abs(roots))
        # NOTE(review): sort_index orders `frequencies` (the deduplicated,
        # zero-filtered angles), but is then used to index `magnitudes`,
        # which is in root order -- the index spaces differ; presumably
        # magnitudes[index][sort_index] was intended.  Verify against the
        # MATLAB original before changing.
        sort_index = np.argsort(frequencies)
        frame_frequencies[window, :len(sort_index)] = frequencies[sort_index]
        frame_magnitudes[window, :len(sort_index)] = magnitudes[sort_index]
    return frame_frequencies, frame_magnitudes


    def lpc_to_lsf(all_lpc):
    if len(all_lpc.shape) < 2:
    all_lpc = all_lpc[None]
    order = all_lpc.shape[1] - 1
    all_lsf = np.zeros((len(all_lpc), order))
    for i in range(len(all_lpc)):
    lpc = all_lpc[i]
    lpc1 = np.append(lpc, 0)
    lpc2 = lpc1[::-1]
    sum_filt = lpc1 + lpc2
    diff_filt = lpc1 - lpc2

    if order % 2 != 0:
    deconv_diff, _ = sg.deconvolve(diff_filt, [1, 0, -1])
    deconv_sum = sum_filt
    else:
    deconv_diff, _ = sg.deconvolve(diff_filt, [1, -1])
    deconv_sum, _ = sg.deconvolve(sum_filt, [1, 1])

    roots_diff = np.roots(deconv_diff)
    roots_sum = np.roots(deconv_sum)
    angle_diff = np.angle(roots_diff[::2])
    angle_sum = np.angle(roots_sum[::2])
    lsf = np.sort(np.hstack((angle_diff, angle_sum)))
    if len(lsf) != 0:
    all_lsf[i] = lsf
    return np.squeeze(all_lsf)


    def lsf_to_lpc(all_lsf):
    if len(all_lsf.shape) < 2:
    all_lsf = all_lsf[None]
    order = all_lsf.shape[1]
    all_lpc = np.zeros((len(all_lsf), order + 1))
    for i in range(len(all_lsf)):
    lsf = all_lsf[i]
    zeros = np.exp(1j * lsf)
    sum_zeros = zeros[::2]
    diff_zeros = zeros[1::2]
    sum_zeros = np.hstack((sum_zeros, np.conj(sum_zeros)))
    diff_zeros = np.hstack((diff_zeros, np.conj(diff_zeros)))
    sum_filt = np.poly(sum_zeros)
    diff_filt = np.poly(diff_zeros)

    if order % 2 != 0:
    deconv_diff = sg.convolve(diff_filt, [1, 0, -1])
    deconv_sum = sum_filt
    else:
    deconv_diff = sg.convolve(diff_filt, [1, -1])
    deconv_sum = sg.convolve(sum_filt, [1, 1])

    lpc = .5 * (deconv_sum + deconv_diff)
    # Last coefficient is 0 and not returned
    all_lpc[i] = lpc[:-1]
    return np.squeeze(all_lpc)


    def lpc_synthesis(lp_coefficients, per_frame_gain, residual_excitation=None,
    voiced_frames=None, window_step=128, emphasis=0.9):
    """
    @@ -854,7 +966,7 @@ def lpc_synthesis(lp_coefficients, per_frame_gain, residual_excitation=None,
    if residual_excitation is None:
    # Need to generate excitation
    if voiced_frames is None:
    # No voiced/unvoiced info, so just use randn
    # No voiced/unvoiced info
    voiced_frames = np.ones((lp_coefficients.shape[0], 1))
    residual_excitation = np.zeros((n_excitation_points))
    f, m = lpc_to_frequency(lp_coefficients, per_frame_gain)
    @@ -899,7 +1011,7 @@ def lpc_synthesis(lp_coefficients, per_frame_gain, residual_excitation=None,
    return synthesized


    def soundsc(X, copy=True):
    def soundsc(X, gain_scale=.9, copy=True):
    """
    Approximate implementation of soundsc from MATLAB without the audio playing.
    @@ -908,69 +1020,112 @@ def soundsc(X, copy=True):
    X : ndarray
    Signal to be rescaled
    gain_scale : float
    Gain multipler, default .9 (90% of maximum representation)
    copy : bool, optional (default=True)
    Whether to make a copy of input signal or operate in place.
    Returns
    -------
    X_sc : ndarray
    (-1, 1) scaled version of X as float32, suitable for writing
    (-32767, 32767) scaled version of X as int16, suitable for writing
    with scipy.io.wavfile
    """
    X = np.array(X, copy=copy)
    X = (X - X.min()) / (X.max() - X.min())
    X = .9 * X
    X = 2 * X - 1
    return X.astype('float32')
    X = gain_scale * X
    X = X * 2 ** 15
    return X.astype('int16')


    def _wav2array(nchannels, sampwidth, data):
    # wavio.py
    # Author: Warren Weckesser
    # License: BSD 3-Clause (http://opensource.org/licenses/BSD-3-Clause)

    """data must be the string containing the bytes from the wav file."""
    num_samples, remainder = divmod(len(data), sampwidth * nchannels)
    if remainder > 0:
    raise ValueError('The length of data is not a multiple of '
    'sampwidth * num_channels.')
    if sampwidth > 4:
    raise ValueError("sampwidth must not be greater than 4.")

    if sampwidth == 3:
    a = np.empty((num_samples, nchannels, 4), dtype=np.uint8)
    raw_bytes = np.fromstring(data, dtype=np.uint8)
    a[:, :, :sampwidth] = raw_bytes.reshape(-1, nchannels, sampwidth)
    a[:, :, sampwidth:] = (a[:, :, sampwidth - 1:sampwidth] >> 7) * 255
    result = a.view('<i4').reshape(a.shape[:-1])
    else:
    # 8 bit samples are stored as unsigned ints; others as signed ints.
    dt_char = 'u' if sampwidth == 1 else 'i'
    a = np.fromstring(data, dtype='<%s%d' % (dt_char, sampwidth))
    result = a.reshape(-1, nchannels)
    return result


    def lpc_to_frequency(lp_coefficients, per_frame_gain):
    def readwav(file):
    # wavio.py
    # Author: Warren Weckesser
    # License: BSD 3-Clause (http://opensource.org/licenses/BSD-3-Clause)
    """
    Extract resonant frequencies and magnitudes from LPC coefficients and gains.
    Parameters
    ----------
    lp_coefficients : ndarray
    LPC coefficients, such as those calculated by ``lpc_analysis``
    Read a wav file.
    per_frame_gain : ndarray
    Gain calculated for each frame, such as those calculated
    by ``lpc_analysis``
    Returns the frame rate, sample width (in bytes) and a numpy array
    containing the data.
    Returns
    -------
    frequencies : ndarray
    Resonant frequencies calculated from LPC coefficients and gain. Returned
    frequencies are from 0 to 2 * pi
    This function does not read compressed wav files.
    """
    wav = wave.open(file)
    rate = wav.getframerate()
    nchannels = wav.getnchannels()
    sampwidth = wav.getsampwidth()
    nframes = wav.getnframes()
    data = wav.readframes(nframes)
    wav.close()
    array = _wav2array(nchannels, sampwidth, data)
    return rate, sampwidth, array

    magnitudes : ndarray
    Magnitudes of resonant frequencies

    References
    ----------
    D. P. W. Ellis (2004), "Sinewave Speech Analysis/Synthesis in Matlab",
    Web resource, available: http://www.ee.columbia.edu/ln/labrosa/matlab/sws/
    def csvd(arr):
    """
    n_windows, order = lp_coefficients.shape
    Do the complex SVD of a 2D array, returning real valued U, S, VT
    frame_frequencies = np.zeros((n_windows, (order - 1) // 2))
    frame_magnitudes = np.zeros_like(frame_frequencies)
    http://stemblab.github.io/complex-svd/
    """
    C_r = arr.real
    C_i = arr.imag
    block_x = C_r.shape[0]
    block_y = C_r.shape[1]
    K = np.zeros((2 * block_x, 2 * block_y))
    # Upper left
    K[:block_x, :block_y] = C_r
    # Lower left
    K[:block_x, block_y:] = C_i
    # Upper right
    K[block_x:, :block_y] = -C_i
    # Lower right
    K[block_x:, block_y:] = C_r
    return svd(K, full_matrices=False)

    for window in range(n_windows):
    w_coefs = lp_coefficients[window]
    g_coefs = per_frame_gain[window]
    roots = np.roots(np.hstack(([1], w_coefs[1:])))
    # Roots doesn't return the same thing as MATLAB... agh
    frequencies, index = np.unique(
    np.abs(np.angle(roots)), return_index=True)
    # Make sure 0 doesn't show up...
    gtz = np.where(frequencies > 0)[0]
    frequencies = frequencies[gtz]
    index = index[gtz]
    magnitudes = g_coefs / (1. - np.abs(roots))
    sort_index = np.argsort(frequencies)
    frame_frequencies[window, :len(sort_index)] = frequencies[sort_index]
    frame_magnitudes[window, :len(sort_index)] = magnitudes[sort_index]
    return frame_frequencies, frame_magnitudes

    def icsvd(U, S, VT):
    """
    Invert back to complex values from the output of csvd
    U, S, VT = csvd(X)
    X_rec = inv_csvd(U, S, VT)
    """
    K = U.dot(np.diag(S)).dot(VT)
    block_x = U.shape[0] // 2
    block_y = U.shape[1] // 2
    arr_rec = np.zeros((block_x, block_y)) + 0j
    arr_rec.real = K[:block_x, :block_y]
    arr_rec.imag = K[:block_x, block_y:]
    return arr_rec


    def sinusoid_analysis(X, input_sample_rate, resample_block=128, copy=True):
    @@ -1221,11 +1376,10 @@ def overlap(X, window_size, window_step):
    # Make sure there are an even number of windows before stridetricks
    append = np.zeros((window_size - len(X) % window_size))
    X = np.hstack((X, append))
    num_frames = len(X) // window_step - 1
    row_stride = X.itemsize * window_step
    col_stride = X.itemsize
    X_strided = as_strided(X, shape=(num_frames, window_size),
    strides=(row_stride, col_stride))
    overlap_sz = window_size - window_step
    new_shape = X.shape[:-1] + ((X.shape[-1] - overlap_sz) // window_step, window_size)
    new_strides = X.strides[:-1] + (window_step * X.strides[-1],) + X.strides[-1:]
    X_strided = as_strided(X, shape=new_shape, strides=new_strides)
    return X_strided


    @@ -1287,42 +1441,55 @@ def invert_halfoverlap(X_strided):
    return X


    def csvd(arr):
    def overlap_add(X_strided, window_step, wsola=False):
    """
    Do the complex SVD of a 2D array, returning real valued U, S, VT
    overlap add to reconstruct X
    http://stemblab.github.io/complex-svd/
    """
    C_r = arr.real
    C_i = arr.imag
    block_x = C_r.shape[0]
    block_y = C_r.shape[1]
    K = np.zeros((2 * block_x, 2 * block_y))
    # Upper left
    K[:block_x, :block_y] = C_r
    # Lower left
    K[:block_x, block_y:] = C_i
    # Upper right
    K[block_x:, :block_y] = -C_i
    # Lower right
    K[block_x:, block_y:] = C_r
    return svd(K, full_matrices=False)
    Parameters
    ----------
    X_strided : ndarray, shape=(n_windows, window_size)
    X as overlapped windows
    window_step : int
    step size for overlap add
    def icsvd(U, S, VT):
    Returns
    -------
    X : ndarray, shape=(n_samples,)
    Reconstructed version of X
    """
    Invert back to complex values from the output of csvd
    n_rows, window_size = X_strided.shape

    U, S, VT = csvd(X)
    X_rec = inv_csvd(U, S, VT)
    """
    K = U.dot(np.diag(S)).dot(VT)
    block_x = U.shape[0] // 2
    block_y = U.shape[1] // 2
    arr_rec = np.zeros((block_x, block_y)) + 0j
    arr_rec.real = K[:block_x, :block_y]
    arr_rec.imag = K[:block_x, block_y:]
    return arr_rec
    # Start with largest size (no overlap) then truncate after we finish
    # +2 for one window on each side
    X = np.zeros(((n_rows + 2) * window_size,)).astype(X_strided.dtype)
    start_index = 0

    total_windowing_sum = np.zeros((X.shape[0]))
    win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(window_size) / (
    window_size - 1))
    for i in range(n_rows):
    end_index = start_index + window_size
    if wsola:
    offset_size = window_size - window_step
    offset = xcorr_offset(X[start_index:start_index + offset_size],
    X_strided[i, :offset_size])
    ss = start_index - offset
    st = end_index - offset
    if start_index - offset < 0:
    ss = 0
    st = 0 + (end_index - start_index)
    X[ss:st] += X_strided[i]
    total_windowing_sum[ss:st] += win
    start_index = start_index + window_step
    else:
    X[start_index:end_index] += X_strided[i]
    total_windowing_sum[start_index:end_index] += win
    start_index += window_step
    # Not using this right now
    #X = np.real(X) / (total_windowing_sum + 1)
    X = X[:end_index]
    return X


    def overlap_compress(X, n_components, window_size):
    @@ -1379,63 +1546,6 @@ def overlap_uncompress(X_compressed, window_size):
    return invert_halfoverlap(X_r)


def lpc_to_lsf(all_lpc):
    """Convert LPC coefficients to line spectral frequencies (LSF).

    Parameters
    ----------
    all_lpc : ndarray, shape=(n_frames, order + 1) or (order + 1,)
        LPC coefficient frames (leading coefficient included).

    Returns
    -------
    all_lsf : ndarray, shape=(n_frames, order)
        Sorted LSF angles in radians per frame; squeezed for a single frame.
    """
    if len(all_lpc.shape) < 2:
        all_lpc = all_lpc[None]
    order = all_lpc.shape[1] - 1
    all_lsf = np.zeros((len(all_lpc), order))
    for i in range(len(all_lpc)):
        lpc = all_lpc[i]
        # Form the palindromic (sum) and antipalindromic (difference)
        # polynomials from the LPC polynomial and its reversal
        lpc1 = np.append(lpc, 0)
        lpc2 = lpc1[::-1]
        sum_filt = lpc1 + lpc2
        diff_filt = lpc1 - lpc2

        # Divide out the trivial roots at z = +/-1; which polynomial has
        # them depends on the parity of the order
        if order % 2 != 0:
            deconv_diff, _ = sg.deconvolve(diff_filt, [1, 0, -1])
            deconv_sum = sum_filt
        else:
            deconv_diff, _ = sg.deconvolve(diff_filt, [1, -1])
            deconv_sum, _ = sg.deconvolve(sum_filt, [1, 1])

        roots_diff = np.roots(deconv_diff)
        roots_sum = np.roots(deconv_sum)
        # Roots come in conjugate pairs; every other root keeps one of each pair
        angle_diff = np.angle(roots_diff[::2])
        angle_sum = np.angle(roots_sum[::2])
        lsf = np.sort(np.hstack((angle_diff, angle_sum)))
        if len(lsf) != 0:
            all_lsf[i] = lsf
    return np.squeeze(all_lsf)


def lsf_to_lpc(all_lsf):
    """Convert line spectral frequencies (LSF) back to LPC coefficients.

    Parameters
    ----------
    all_lsf : ndarray, shape=(n_frames, order) or (order,)
        Sorted LSF angles in radians, one row per frame.

    Returns
    -------
    all_lpc : ndarray, shape=(n_frames, order + 1)
        LPC coefficient frames; squeezed to 1D for a single frame.
    """
    if len(all_lsf.shape) < 2:
        all_lsf = all_lsf[None]
    order = all_lsf.shape[1]
    all_lpc = np.zeros((len(all_lsf), order + 1))
    for i in range(len(all_lsf)):
        lsf = all_lsf[i]
        # Place unit-circle zeros at the LSF angles; even-indexed angles
        # rebuild the sum polynomial, odd-indexed the difference polynomial
        zeros = np.exp(1j * lsf)
        sum_zeros = zeros[::2]
        diff_zeros = zeros[1::2]
        # Add the complex conjugates so np.poly produces real coefficients
        sum_zeros = np.hstack((sum_zeros, np.conj(sum_zeros)))
        diff_zeros = np.hstack((diff_zeros, np.conj(diff_zeros)))
        sum_filt = np.poly(sum_zeros)
        diff_filt = np.poly(diff_zeros)

        # Reinsert the trivial roots at z = +/-1 (the inverse of the
        # deconvolution done in lpc_to_lsf)
        if order % 2 != 0:
            deconv_diff = sg.convolve(diff_filt, [1, 0, -1])
            deconv_sum = sum_filt
        else:
            deconv_diff = sg.convolve(diff_filt, [1, -1])
            deconv_sum = sg.convolve(sum_filt, [1, 1])

        lpc = .5 * (deconv_sum + deconv_diff)
        # Last coefficient is 0 and not returned
        all_lpc[i] = lpc[:-1]
    return np.squeeze(all_lpc)


    def herz_to_mel(freqs):
    """
    Based on code by Dan Ellis
    @@ -1508,7 +1618,7 @@ def mel_freq_weights(n_fft, fs, n_filts=None, width=None):
    fft_freqs = np.arange(n_fft // 2) / n_fft * fs
    min_mel = herz_to_mel(min_freq)
    max_mel = herz_to_mel(max_freq)
    partial = np.arange(n_filts + 2) / (n_filts + 1) * (max_mel - min_mel)
    partial = np.arange(n_filts + 2) / (n_filts + 1.) * (max_mel - min_mel)
    bin_freqs = mel_to_herz(min_mel + partial)
    bin_bin = np.round(bin_freqs / fs * (n_fft - 1))
    for i in range(n_filts):
    @@ -1622,6 +1732,14 @@ def abs_and_angle_to_complex(arr_abs, arr_angle):
    return arr_abs * np.exp(1j * arr_angle)


    def angle_to_sin_cos(arr_angle):
    return np.hstack((np.sin(arr_angle), np.cos(arr_angle)))


    def sin_cos_to_angle(arr_sin, arr_cos):
    return np.arctan2(arr_sin, arr_cos)


    def polyphase_core(x, m, f):
    # x = input data
    # m = decimation rate
    @@ -1693,14 +1811,6 @@ def unwindow(arr, window_size, window_step=1, axis=0):
    return unwindowed.mean(axis=1)


def angle_to_sin_cos(arr_angle):
    """Stack sine and cosine of the angles side by side (inverse of sin_cos_to_angle)."""
    return np.hstack((np.sin(arr_angle), np.cos(arr_angle)))


def sin_cos_to_angle(arr_sin, arr_cos):
    """Recover phase angles in (-pi, pi] from sine and cosine components."""
    return np.arctan2(arr_sin, arr_cos)


    def xcorr_offset(x1, x2):
    """
    Under MSR-LA License
    @@ -1792,7 +1902,8 @@ def invert_spectrogram(X_s, step, calculate_offset=True, set_zero_phase=True):
    return wave


    def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False):
    def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False,
    complex_input=False):
    """
    Under MSR-LA License
    @@ -1818,7 +1929,7 @@ def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False):
    for i in range(n_iter):
    if verbose:
    print("Runnning iter %i" % i)
    if i == 0:
    if i == 0 and not complex_input:
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
    set_zero_phase=True)
    else:
    @@ -1829,7 +1940,9 @@ def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False):
    set_zero_phase=False)
    est = stft(X_t, fftsize=fftsize, step=step, compute_onesided=False)
    phase = est / np.maximum(reg, np.abs(est))
    X_best = X_s * phase[:len(X_s)]
    phase = phase[:len(X_s)]
    X_s = X_s[:len(phase)]
    X_best = X_s * phase
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
    set_zero_phase=False)
    return np.real(X_t)
    @@ -2274,11 +2387,11 @@ def run_fft_dct_example():
    Trying to run all examples will seg fault on my laptop - probably memory!
    Comment individually
    """
    # run_phase_reconstruction_example()
    # run_phase_vq_example()
    # run_dct_vq_example()
    # run_fft_vq_example()
    # run_lpc_example()
    # run_cqt_example()
    run_fft_dct_example()
    test_all()
    run_phase_reconstruction_example()
    #run_phase_vq_example()
    #run_dct_vq_example()
    #run_fft_vq_example()
    #run_lpc_example()
    #run_cqt_example()
    #run_fft_dct_example()
    #test_all()
  24. kastnerkyle revised this gist Mar 13, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -2248,7 +2248,7 @@ def run_fft_dct_example():
    random_state = np.random.RandomState(1999)

    fs, d = fetch_sample_speech_fruit()
    n_fft = 128
    n_fft = 64
    X = d[0]
    X_stft = stft(X, n_fft)
    X_rr = complex_to_real_view(X_stft)
  25. kastnerkyle revised this gist Mar 13, 2016. 1 changed file with 32 additions and 6 deletions.
    38 changes: 32 additions & 6 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -4,11 +4,11 @@
    import numpy as np
    # Do numpy version check for <= 1.9 ! Something crazy going on in 1.10
    if int(np.__version__.split(".")[1]) >= 10:
    raise ValueError("Only numpy <= 1.9 currently supported! There is a "
    "numerical error in one of the numpy 1.10 routines. "
    "Hopefully this will be debugged an corrected soon. "
    "For the intrepid, the error can be seen by running"
    "run_phase_reconstruction()")
    print("Only numpy <= 1.9 currently supported! There is a "
    "numerical error in one of the numpy 1.10 routines. "
    "Hopefully this will be debugged an corrected soon. "
    "For the intrepid, the error can be seen by running"
    "run_phase_reconstruction()")
    from numpy.lib.stride_tricks import as_strided
    import scipy.signal as sg
    from scipy.cluster.vq import vq
    @@ -2244,6 +2244,31 @@ def run_cqt_example():
    wavfile.write("cqt_reconstruction.wav", fs, soundsc(X_r))


def run_fft_dct_example():
    """Round-trip a speech sample through STFT -> DCT -> delta coding and
    back, print the reconstruction error in dB, and write before/after wavs.
    """
    random_state = np.random.RandomState(1999)

    fs, d = fetch_sample_speech_fruit()
    n_fft = 128
    X = d[0]
    X_stft = stft(X, n_fft)
    # View the complex STFT as interleaved real values so DCT applies
    X_rr = complex_to_real_view(X_stft)
    X_dct = fftpack.dct(X_rr, axis=-1, norm='ortho')
    # First-order difference across time frames (delta coding)
    X_dct_sub = X_dct[1:] - X_dct[:-1]
    std = X_dct_sub.std(axis=0, keepdims=True)
    # Inject small per-coefficient noise to simulate quantization error
    X_dct_sub += .01 * std * random_state.randn(
        X_dct_sub.shape[0], X_dct_sub.shape[1])
    # Undo delta coding, then invert DCT and STFT
    X_dct_unsub = np.cumsum(X_dct_sub, axis=0)
    X_idct = fftpack.idct(X_dct_unsub, axis=-1, norm='ortho')
    X_irr = real_to_complex_view(X_idct)
    X_r = istft(X_irr, n_fft)[:len(X)]

    # NOTE(review): despite the name, this is error-to-signal in dB
    # (more negative is better), not a conventional SNR
    SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X))
    print(SNR)

    wavfile.write("fftdct_orig.wav", fs, soundsc(X))
    wavfile.write("fftdct_rec.wav", fs, soundsc(X_r))


    if __name__ == "__main__":
    """
    Trying to run all examples will seg fault on my laptop - probably memory!
    @@ -2254,5 +2279,6 @@ def run_cqt_example():
    # run_dct_vq_example()
    # run_fft_vq_example()
    # run_lpc_example()
    run_cqt_example()
    # run_cqt_example()
    run_fft_dct_example()
    test_all()
  26. kastnerkyle revised this gist Mar 8, 2016. 1 changed file with 171 additions and 10 deletions.
    181 changes: 171 additions & 10 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -240,6 +240,29 @@ def imdct_slow(X, dctsize=128):


    def nsgcwin(fmin, fmax, n_bins, fs, signal_len, gamma):
    """
    Nonstationary Gabor window calculation
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    # use a hanning window
    # no fractional shifts
    fftres = fs / signal_len
    @@ -336,6 +359,29 @@ def _win(numel):


    def nsgtf_real(X, multiscale, shift, window_lens):
    """
    Nonstationary Gabor Transform for real values
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    # This will break with multchannel input
    signal_len = len(X)
    N = len(shift)
    @@ -383,7 +429,78 @@ def nsgtf_real(X, multiscale, shift, window_lens):
    return c


    def nsdual(multiscale, shift, window_lens):
    """
    Calculation of nonstationary inverse gabor filters
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    N = len(shift)
    posit = np.cumsum(shift)
    seq_len = posit[-1]
    posit = posit - shift[0]

    diagonal = np.zeros((seq_len,))
    win_range = []

    for ii in range(N):
    filt_len = len(multiscale[ii])
    idx = np.arange(-np.floor(filt_len / 2), np.ceil(filt_len / 2))
    win_range.append((posit[ii] + idx) % seq_len)
    subdiag = window_lens[ii] * np.fft.fftshift(multiscale[ii]) ** 2
    ind = win_range[ii].astype(np.int)
    diagonal[ind] = diagonal[ind] + subdiag

    dual_multiscale = multiscale
    for ii in range(N):
    ind = win_range[ii].astype(np.int)
    dual_multiscale[ii] = np.fft.ifftshift(
    np.fft.fftshift(dual_multiscale[ii]) / diagonal[ind])
    return dual_multiscale


    def nsgitf_real(c, c_dc, c_nyq, multiscale, shift):
    """
    Nonstationary Inverse Gabor Transform on real valued signal
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    c_l = []
    c_l.append(c_dc)
    c_l.extend([ci for ci in c])
    @@ -424,6 +541,29 @@ def nsgitf_real(c, c_dc, c_nyq, multiscale, shift):


    def cqt(X, fs, n_bins=48, fmin=27.5, fmax="nyq", gamma=20):
    """
    Constant Q Transform
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    if fmax == "nyq":
    fmax = fs / 2.
    multiscale, shift, window_lens = nsgcwin(fmin, fmax, n_bins, fs,
    @@ -442,11 +582,35 @@ def cqt(X, fs, n_bins=48, fmin=27.5, fmax="nyq", gamma=20):
    c_nyq = c[-1]
    c_sub = c[1:-1]
    c = np.vstack(c_sub)
    return c, c_dc, c_nyq, multiscale, shift
    return c, c_dc, c_nyq, multiscale, shift, window_lens


    def icqt(X_cq, c_dc, c_nyq, multiscale, shift, window_lens):
    """
    Inverse constant Q Transform
    References
    ----------
    Velasco G. A., Holighaus N., Dörfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011
    def icqt(X_cq, c_dc, c_nyq, multiscale, shift):
    X = nsgitf_real(X_cq, c_dc, c_nyq, multiscale, shift)
    Holighaus N., Dörfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.
    Original matlab code copyright follows:
    AUTHOR(s) : Monika Dörfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011
    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    new_multiscale = nsdual(multiscale, shift, window_lens)
    X = nsgitf_real(X_cq, c_dc, c_nyq, new_multiscale, shift)
    return X


    @@ -2071,16 +2235,13 @@ def apply_preprocess(list_of_data, clusters):


    def run_cqt_example():
    from matplotlib import pyplot as plt
    fs, d = fetch_sample_file("/Users/User/cqt_resources/kempff1.wav")
    X = d[:44100]
    X_cq, c_dc, c_nyq, multiscale, shift = cqt(X, fs)
    X_r = icqt(X_cq, c_dc, c_nyq, multiscale, shift)
    X_cq, c_dc, c_nyq, multiscale, shift, window_lens = cqt(X, fs)
    X_r = icqt(X_cq, c_dc, c_nyq, multiscale, shift, window_lens)
    SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X))
    wavfile.write("o.wav", fs, soundsc(X))
    wavfile.write("r.wav", fs, soundsc(X_r))
    from IPython import embed; embed()
    raise ValueError()
    wavfile.write("cqt_original.wav", fs, soundsc(X))
    wavfile.write("cqt_reconstruction.wav", fs, soundsc(X_r))


    if __name__ == "__main__":
  27. kastnerkyle revised this gist Mar 7, 2016. 1 changed file with 53 additions and 5 deletions.
    58 changes: 53 additions & 5 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -383,6 +383,46 @@ def nsgtf_real(X, multiscale, shift, window_lens):
    return c


    def nsgitf_real(c, c_dc, c_nyq, multiscale, shift):
    c_l = []
    c_l.append(c_dc)
    c_l.extend([ci for ci in c])
    c_l.append(c_nyq)

    posit = np.cumsum(shift)
    seq_len = posit[-1]
    posit -= shift[0]
    out = np.zeros((seq_len,)).astype(c_l[1].dtype)

    for ii in range(len(c_l)):
    filt_len = len(multiscale[ii])
    win_range = posit[ii] + np.arange(-np.floor(filt_len / 2),
    np.ceil(filt_len / 2))
    win_range = (win_range % seq_len).astype(np.int)
    temp = np.fft.fft(c_l[ii]) * len(c_l[ii])

    fs_new_bins = len(c_l[ii])
    fk_bins = posit[ii]
    displace = int(fk_bins - np.floor(fk_bins / fs_new_bins) * fs_new_bins)
    temp = np.roll(temp, -displace)
    l = np.arange(len(temp) - np.floor(filt_len / 2), len(temp))
    r = np.arange(np.ceil(filt_len / 2))
    temp_idx = (np.concatenate((l, r)) % len(temp)).astype(np.int)
    temp = temp[temp_idx]
    lf = np.arange(filt_len - np.floor(filt_len / 2), filt_len)
    rf = np.arange(np.ceil(filt_len / 2))
    filt_idx = np.concatenate((lf, rf)).astype(np.int)
    m = multiscale[ii][filt_idx]
    out[win_range] = out[win_range] + m * temp

    nyq_bin = np.floor(seq_len / 2) + 1
    out_idx = np.arange(
    nyq_bin - np.abs(1 - seq_len % 2) - 1, 0, -1).astype(np.int)
    out[nyq_bin:] = np.conj(out[out_idx])
    t_out = np.real(np.fft.ifft(out)).astype(np.float64)
    return t_out


    def cqt(X, fs, n_bins=48, fmin=27.5, fmax="nyq", gamma=20):
    if fmax == "nyq":
    fmax = fs / 2.
    @@ -402,7 +442,12 @@ def cqt(X, fs, n_bins=48, fmin=27.5, fmax="nyq", gamma=20):
    c_nyq = c[-1]
    c_sub = c[1:-1]
    c = np.vstack(c_sub)
    return c, c_dc, c_nyq
    return c, c_dc, c_nyq, multiscale, shift


    def icqt(X_cq, c_dc, c_nyq, multiscale, shift):
    X = nsgitf_real(X_cq, c_dc, c_nyq, multiscale, shift)
    return X


    def rolling_mean(X, window_size):
    @@ -2026,13 +2071,16 @@ def apply_preprocess(list_of_data, clusters):


    def run_cqt_example():
    fs, d = fetch_sample_file("/Users/User/Downloads/kempff1.wav")
    X = d[:44100]
    X_cq, c_dc, c_nyq = cqt(X, fs)
    from matplotlib import pyplot as plt
    fs, d = fetch_sample_file("/Users/User/cqt_resources/kempff1.wav")
    X = d[:44100]
    X_cq, c_dc, c_nyq, multiscale, shift = cqt(X, fs)
    X_r = icqt(X_cq, c_dc, c_nyq, multiscale, shift)
    SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X))
    wavfile.write("o.wav", fs, soundsc(X))
    wavfile.write("r.wav", fs, soundsc(X_r))
    from IPython import embed; embed()
    raise ValueError()
    # icqt(X_cq)


    if __name__ == "__main__":
  28. kastnerkyle revised this gist Mar 4, 2016. 1 changed file with 191 additions and 3 deletions.
    194 changes: 191 additions & 3 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -80,6 +80,17 @@ def fetch_sample_speech_tapestry():
    return fs, d


    def fetch_sample_file(wav_path):
    if not os.path.exists(wav_path):
    raise ValueError("Unable to find file at path %s" % wav_path)
    fs, d = wavfile.read(wav_path)
    d = d.astype('float32') / (2 ** 15)
    # file is stereo - just choose one channel
    if len(d.shape) > 1:
    d = d[:, 0]
    return fs, d


    def fetch_sample_music():
    url = "http://www.music.helsinki.fi/tmt/opetus/uusmedia/esim/"
    url += "a2002011001-e02-16kHz.wav"
    @@ -228,6 +239,172 @@ def imdct_slow(X, dctsize=128):
    return X


def nsgcwin(fmin, fmax, n_bins, fs, signal_len, gamma):
    """Build nonstationary Gabor (constant-Q) analysis windows.

    Parameters
    ----------
    fmin, fmax : float
        Lowest / highest center frequency in Hz.
    n_bins : int
        Bins per octave.
    fs : int
        Sample rate in Hz.
    signal_len : int
        Length of the signal to be analyzed, in samples.
    gamma : float
        Additive bandwidth offset in Hz.

    Returns
    -------
    multiscale : list of ndarray
        One Hann-shaped window per band (positive and mirrored
        negative-frequency bands).
    shift : ndarray, dtype=int32
        Shifts between window centers, in FFT bins.
    M : ndarray
        Number of channels per band.
    """
    # use a hanning window
    # no fractional shifts
    fftres = fs / signal_len
    fmin = float(fmin)
    fmax = float(fmax)
    gamma = float(gamma)
    nyq = fs / 2.
    # Geometrically spaced center frequencies, n_bins per octave
    b = np.floor(n_bins * np.log2(fmax / fmin))
    fbas = fmin * 2 ** (np.arange(b + 1) / float(n_bins))
    # Constant-Q bandwidths plus the additive gamma offset
    Q = 2 ** (1. / n_bins) - 2 ** (-1. / n_bins)
    cqtbw = Q * fbas + gamma
    cqtbw = cqtbw.ravel()
    # Drop bands whose support would exceed the Nyquist frequency
    maxidx = np.where(fbas + cqtbw / 2. > nyq)[0]
    if len(maxidx) > 0:
        # replicate bug in MATLAB version...
        # or is it a feature
        if sum(maxidx) == 0:
            first = len(cqtbw) - 1
        else:
            first = maxidx[0]
        fbas = fbas[:first]
        cqtbw = cqtbw[:first]
    # Drop bands whose support would dip below 0 Hz
    minidx = np.where(fbas - cqtbw / 2. < 0)[0]
    if len(minidx) > 0:
        fbas = fbas[minidx[-1]+1:]
        cqtbw = cqtbw[minidx[-1]+1:]

    # Assemble the full frequency axis: DC, positive bands, Nyquist,
    # mirrored negative-frequency bands
    fbas_len = len(fbas)
    fbas_new = np.zeros((2 * (len(fbas) + 1)))
    fbas_new[1:len(fbas) + 1] = fbas
    fbas = fbas_new
    fbas[fbas_len + 1] = nyq
    fbas[fbas_len + 2:] = fs - fbas[1:fbas_len + 1][::-1]
    bw = np.zeros_like(fbas)
    bw[0] = 2 * fmin
    bw[1:len(cqtbw) + 1] = cqtbw
    bw[len(cqtbw) + 1] = fbas[fbas_len + 2] - fbas[fbas_len]
    bw[-len(cqtbw):] = cqtbw[::-1]
    # Convert Hz to FFT bins
    bw = bw / fftres
    fbas = fbas / fftres

    # Window center positions in bins; floor for the positive half,
    # ceil for the mirrored half
    posit = np.zeros_like(fbas)
    posit[:fbas_len + 2] = np.floor(fbas[:fbas_len + 2])
    posit[fbas_len + 2:] = np.ceil(fbas[fbas_len + 2:])
    base_shift = -posit[-1] % signal_len
    shift = np.zeros_like(posit).astype("int32")
    shift[1:] = (posit[1:] - posit[:-1]).astype("int32")
    shift[0] = base_shift

    bw = np.round(bw)
    bwfac = 1
    # NOTE(review): M aliases bw here — writes to one are visible in the other
    M = bw

    # Enforce a minimum window length
    min_win = 4
    for ii in range(len(bw)):
        if bw[ii] < min_win:
            bw[ii] = min_win
            M[ii] = bw[ii]

    def _win(numel):
        # Periodic Hann window of length numel, built from a centered ramp
        if numel % 2 == 0:
            s1 = np.arange(0, .5, 1. / numel)
            if len(s1) != numel // 2:
                # edge case with small floating point numbers...
                s1 = s1[:-1]
            s2 = np.arange(-.5, 0, 1. / numel)
            if len(s2) != numel // 2:
                # edge case with small floating point numbers...
                s2 = s2[:-1]
            x = np.concatenate((s1, s2))
        else:
            s1 = np.arange(0, .5, 1. / numel)
            s2 = np.arange(-.5 + .5 / numel, 0, 1. / numel)
            if len(s2) != numel // 2:  # assume integer truncate 27 // 2 = 13
                s2 = s2[:-1]
            x = np.concatenate((s1, s2))
        assert len(x) == numel
        g = .5 + .5 * np.cos(2 * np.pi * x)
        return g

    multiscale = [_win(bi) for bi in bw]
    bw = bwfac * np.ceil(M / bwfac)

    # Widen the DC and Nyquist windows to a flat plateau with Hann edges
    # NOTE(review): M holds floats here — np.ones(M[kk]) and the i1:i2
    # slice rely on float indexing, which newer numpy rejects; verify
    # against the numpy version in use
    for kk in [0, fbas_len + 1]:
        if M[kk] > M[kk + 1]:
            multiscale[kk] = np.ones(M[kk]).astype(multiscale[0].dtype)
            i1 = np.floor(M[kk] / 2) - np.floor(M[kk + 1] / 2)
            i2 = np.floor(M[kk] / 2) + np.ceil(M[kk + 1] / 2)
            # Very rarely, gets an off by 1 error? Seems to be at the end...
            # for now, slice
            multiscale[kk][i1:i2] = _win(M[kk + 1])
            multiscale[kk] = multiscale[kk] / np.sqrt(M[kk])
    return multiscale, shift, M


    def nsgtf_real(X, multiscale, shift, window_lens):
    # This will break with multchannel input
    signal_len = len(X)
    N = len(shift)
    X_fft = np.fft.fft(X)

    fill = np.sum(shift) - signal_len
    if fill > 0:
    X_fft_tmp = np.zeros((signal_len + shift))
    X_fft_tmp[:len(X_fft)] = X_fft
    X_fft = X_fft_tmp
    posit = np.cumsum(shift) - shift[0]
    scale_lens = np.array([len(m) for m in multiscale])
    N = np.where(posit - np.floor(scale_lens) <= (signal_len + fill) / 2)[0][-1]
    c = []
    # c[0] is almost exact
    for ii in range(N):
    idx_l = np.arange(np.ceil(scale_lens[ii] / 2), scale_lens[ii])
    idx_r = np.arange(np.ceil(scale_lens[ii] / 2))
    idx = np.concatenate((idx_l, idx_r))
    idx = idx.astype("int32")
    subwin_range = posit[ii] + np.arange(-np.floor(scale_lens[ii] / 2),
    np.ceil(scale_lens[ii] / 2))
    win_range = subwin_range % (signal_len + fill)
    win_range = win_range.astype("int32")
    if window_lens[ii] < scale_lens[ii]:
    raise ValueError("Not handling 'not enough channels' case")
    else:
    temp = np.zeros((window_lens[ii],)).astype(X_fft.dtype)
    temp_idx_l = np.arange(len(temp) - np.floor(scale_lens[ii] / 2),
    len(temp))
    temp_idx_r = np.arange(np.ceil(scale_lens[ii] / 2))
    temp_idx = np.concatenate((temp_idx_l, temp_idx_r))
    temp_idx = temp_idx.astype("int32")
    temp[temp_idx] = X_fft[win_range] * multiscale[ii][idx]
    fs_new_bins = window_lens[ii]
    fk_bins = posit[ii]
    displace = fk_bins - np.floor(fk_bins / fs_new_bins) * fs_new_bins
    displace = displace.astype("int32")
    temp = np.roll(temp, displace)
    c.append(np.fft.ifft(temp))

    if 0:
    # cell2mat concatenation
    c = np.concatenate(c)
    return c


def cqt(X, fs, n_bins=48, fmin=27.5, fmax="nyq", gamma=20):
    """Constant-Q transform of X via the nonstationary Gabor transform.

    Parameters
    ----------
    X : ndarray, shape=(n_samples,)
        Input signal.
    fs : int
        Sample rate in Hz.
    n_bins : int
        Bins per octave.
    fmin : float
        Lowest analysis frequency in Hz.
    fmax : float or "nyq"
        Highest analysis frequency in Hz; "nyq" uses fs / 2.
    gamma : float
        Additive bandwidth offset passed to nsgcwin.

    Returns
    -------
    c : ndarray
        Stacked coefficients of the middle bands.
    c_dc : ndarray
        DC band coefficients.
    c_nyq : ndarray
        Nyquist band coefficients.
    """
    if fmax == "nyq":
        fmax = fs / 2.
    multiscale, shift, window_lens = nsgcwin(fmin, fmax, n_bins, fs,
                                             len(X), gamma)
    # Band center frequencies in Hz
    # NOTE(review): fbas is computed but never used below — confirm whether
    # it was meant to be returned
    fbas = fs * np.cumsum(shift[1:]) / len(X)
    fbas = fbas[:len(window_lens) // 2 - 1]
    bins = window_lens.shape[0] // 2 - 1
    # Force a uniform channel count across bands (rasterized CQT), and
    # mirror it onto the negative-frequency half
    window_lens[1:bins + 1] = window_lens[bins + 2]
    window_lens[bins + 2:] = window_lens[1:bins + 1][::-1]
    # Normalize windows relative to signal length
    norm = 2. * window_lens[:bins + 2] / float(len(X))
    norm = np.concatenate((norm, norm[1:-1][::-1]))
    multiscale = [norm[ii] * multiscale[ii] for ii in range(2 * (bins + 1))]

    c = nsgtf_real(X, multiscale, shift, window_lens)
    # Split off the DC and Nyquist bands; stack the rest into a 2D array
    c_dc = c[0]
    c_nyq = c[-1]
    c_sub = c[1:-1]
    c = np.vstack(c_sub)
    return c, c_dc, c_nyq


    def rolling_mean(X, window_size):
    w = 1.0 / window_size * np.ones((window_size))
    return np.correlate(X, w, 'valid')
    @@ -1453,7 +1630,7 @@ def implot(arr, scale=None, title="", cmap="gray"):
    import matplotlib.pyplot as plt
    if scale is "specgram":
    # plotting part
    mag = 10. * np.log10(np.abs(arr))
    mag = 20. * np.log10(np.abs(arr))
    # Transpose so time is X axis, and invert y axis so
    # frequency is low at bottom
    mag = mag.T[::-1, :]
    @@ -1848,14 +2025,25 @@ def apply_preprocess(list_of_data, clusters):
    wavfile.write("phase_vq_test_agc.wav", fs, soundsc(agc_vq_d2))


    def run_cqt_example():
    fs, d = fetch_sample_file("/Users/User/Downloads/kempff1.wav")
    X = d[:44100]
    X_cq, c_dc, c_nyq = cqt(X, fs)
    from matplotlib import pyplot as plt
    from IPython import embed; embed()
    raise ValueError()
    # icqt(X_cq)


    if __name__ == "__main__":
    """
    Trying to run all examples will seg fault on my laptop - probably memory!
    Comment individually
    """
    #run_phase_reconstruction_example()
    # run_phase_reconstruction_example()
    # run_phase_vq_example()
    # run_dct_vq_example()
    run_fft_vq_example()
    # run_fft_vq_example()
    # run_lpc_example()
    run_cqt_example()
    test_all()
  29. kastnerkyle revised this gist Feb 26, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -1672,7 +1672,7 @@ def apply_preprocess(list_of_data, clusters):
    # Training data
    vq_d1 = apply_preprocess(d1, clusters)
    vq_d2 = apply_preprocess(d2, clusters)
    assert [i != j for i, j in zip(vq_d2.ravel(), vq_d2.ravel())]
    assert [i != j for i, j in zip(vq_d1.ravel(), vq_d2.ravel())]

    fix_d1 = np.concatenate(d1)
    fix_d2 = np.concatenate(d2)
  30. kastnerkyle revised this gist Feb 20, 2016. 1 changed file with 14 additions and 7 deletions.
    21 changes: 14 additions & 7 deletions audio_tools.py
    Original file line number Diff line number Diff line change
    @@ -1619,13 +1619,19 @@ def run_lpc_example():


    def run_fft_vq_example():
    n_fft = 512
    time_smoothing = 4
    def _pre(list_of_data):
    n_fft = 512
    f_c = np.vstack([stft(dd, n_fft) for dd in list_of_data])
    if len(f_c) % time_smoothing != 0:
    newlen = len(f_c) - len(f_c) % time_smoothing
    f_c = f_c[:newlen]
    f_mag = complex_to_abs(f_c)
    f_phs = complex_to_angle(f_c)
    f_sincos = angle_to_sin_cos(f_phs)
    f_r = np.hstack((f_mag, f_sincos))
    f_r = f_r.reshape((len(f_r) // time_smoothing,
    time_smoothing * f_r.shape[1]))
    return f_r, n_fft

    def preprocess_train(list_of_data, random_state):
    @@ -1637,6 +1643,8 @@ def apply_preprocess(list_of_data, clusters):
    f_r, n_fft = _pre(list_of_data)
    memberships, distances = vq(f_r, clusters)
    vq_r = clusters[memberships]
    vq_r = vq_r.reshape((time_smoothing * len(vq_r),
    vq_r.shape[1] // time_smoothing))
    f_mag = vq_r[:, :n_fft // 2]
    f_sincos = vq_r[:, n_fft // 2:]
    extent = f_sincos.shape[1] // 2
    @@ -1647,19 +1655,18 @@ def apply_preprocess(list_of_data, clusters):

    random_state = np.random.RandomState(1999)

    # Doesn't work yet for unknown reasons...
    """
    fs, d = fetch_sample_music()
    sub = int(.8 * d.shape[0])
    d1 = [d[:sub]]
    d2 = [d[sub:]]

    """

    fs, d = fetch_sample_speech_fruit()
    d1 = d[::8] + d[1::8] + d[2::8] + d[3::8] + d[4::8] + d[5::8] + d[6::8]
    d2 = d[7::8]
    # make sure d1 and d2 aren't the same!
    assert [len(di) for di in d1] != [len(di) for di in d2]
    """

    clusters = preprocess_train(d1, random_state)
    # Training data
    @@ -1846,9 +1853,9 @@ def apply_preprocess(list_of_data, clusters):
    Trying to run all examples will seg fault on my laptop - probably memory!
    Comment individually
    """
    # run_phase_reconstruction_example()
    run_phase_vq_example()
    #run_phase_reconstruction_example()
    # run_phase_vq_example()
    # run_dct_vq_example()
    # run_fft_vq_example()
    run_fft_vq_example()
    # run_lpc_example()
    test_all()