|
|
@@ -0,0 +1,119 @@ |
|
|
import os |
|
|
import numpy as np |
|
|
from scipy.interpolate import interp1d |
|
|
from scipy.io import wavfile |
|
|
import matplotlib.pyplot as plt |
|
|
import PIL.Image |
|
|
import moviepy.editor |
|
|
|
|
|
import dnnlib |
|
|
import dnnlib.tflib as tflib |
|
|
import pretrained_networks |
|
|
|
|
|
# Directory holding the source .mp3 stems, the converted .wav files and outputs.
audio_dirname = 'data'
# Maps track name -> per-video-frame loudness envelope, scaled so its peak is 1.
audio = {}
# Video frame rate; every envelope holds one value per video frame.
fps = 60

# https://www.google.com/search?q=death+grips+black+google+download
# NOTE: `seed`, `duration` and `frames` keep the values of the LAST track
# processed and are reused further down the script. Sorting the directory
# listing makes "last" deterministic instead of filesystem-dependent.
for mp3_filename in sorted(f for f in os.listdir(audio_dirname) if f.endswith('.mp3')):
    mp3_filename = f'{audio_dirname}/{mp3_filename}'
    wav_filename = mp3_filename[:-4] + '.wav'

    # Convert to 16-bit PCM once; later runs reuse the .wav on disk.
    if not os.path.exists(wav_filename):
        audio_clip = moviepy.editor.AudioFileClip(mp3_filename)
        try:
            audio_clip.write_audiofile(wav_filename, fps=44100, nbytes=2, codec='pcm_s16le')
        finally:
            # Release the underlying reader even if the conversion fails.
            audio_clip.close()

    # Drops the first 15 characters and the last 5, i.e. for files named like
    # 'Culture Shock (Vocal).wav' this leaves 'Vocal'.
    track_name = os.path.basename(wav_filename)[15:-5]

    rate, signal = wavfile.read(wav_filename)
    if signal.ndim > 1:
        signal = np.mean(signal, axis=1)  # to mono
    else:
        # Already mono: convert to float so np.abs cannot overflow on the
        # most negative integer sample.
        signal = signal.astype(np.float64)
    signal = np.abs(signal)

    seed = signal.shape[0]
    duration = signal.shape[0] / rate
    frames = int(np.ceil(duration * fps))
    samples_per_frame = signal.shape[0] / frames

    # Mean absolute amplitude of the samples that fall into each video frame.
    envelope = np.zeros(frames, dtype=signal.dtype)
    for frame in range(frames):
        start = int(round(frame * samples_per_frame))
        stop = int(round((frame + 1) * samples_per_frame))
        envelope[frame] = np.mean(signal[start:stop], axis=0)

    # Normalize to a peak of 1; a completely silent track stays all zeros
    # instead of turning into NaNs through a division by zero.
    peak = envelope.max()
    if peak > 0:
        envelope /= peak
    audio[track_name] = envelope
|
|
|
|
|
# Save one envelope plot per track next to the audio files.
for track in sorted(audio):
    fig = plt.figure(figsize=(8, 3))
    plt.title(track)
    plt.plot(audio[track])
    plt.savefig(f'{audio_dirname}/{track}.png')
    # Close the figure explicitly; pyplot otherwise keeps every figure alive.
    plt.close(fig)
|
|
|
|
|
# Pretrained generator to load; only `Gs` is used by the rest of the script.
network_pkl = 'gdrive:networks/stylegan2-ffhq-config-f.pkl'
_G, _D, Gs = pretrained_networks.load_networks(network_pkl)

# Keyword arguments for running the full generator: uint8 NHWC output and
# noise randomization switched off.
Gs_kwargs = dnnlib.EasyDict(
    output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True),
    randomize_noise=False,
)

# Same settings for running the synthesis network directly, plus a minibatch size.
Gs_syn_kwargs = dnnlib.EasyDict(
    output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True),
    randomize_noise=False,
    minibatch_size=4,
)

# Synthesis-network variables whose name starts with 'noise'.
noise_vars = [
    variable
    for var_name, variable in Gs.components.synthesis.vars.items()
    if var_name.startswith('noise')
]

# The generator's 'dlatent_avg' variable, used below as the center that
# latents are pulled towards.
w_avg = Gs.get_var('dlatent_avg')
|
|
|
|
|
def get_ws(n, frames, seed):
    """Return a smooth, looping sequence of `frames` 512-dimensional vectors.

    `n` key vectors are drawn from a standard normal distribution seeded with
    `seed`, spaced evenly over the sequence and joined by quadratic
    interpolation. The result is cached in 'data/ws_{n}_{frames}_{seed}.npy'
    and loaded from there on subsequent calls.

    Args:
        n: Number of random key vectors.
        frames: Number of rows in the returned sequence.
        seed: Seed for the random key vectors.

    Returns:
        Array of shape (frames, 512).
    """
    filename = f'data/ws_{n}_{frames}_{seed}.npy'
    if os.path.exists(filename):
        return np.load(filename)

    src_ws = np.random.RandomState(seed).randn(n, 512)

    # Repeat the key vectors three times along the time axis and keep only
    # the middle repetition: it then has real neighbours on both sides, so
    # the end of the sequence blends smoothly back into its start.
    x = np.linspace(0, 3 * frames, 3 * len(src_ws), endpoint=False)
    y = np.tile(src_ws, (3, 1))

    # Evaluate only the middle third (positions frames .. 2*frames - 1), for
    # all 512 components at once, instead of interpolating each column over
    # the full tripled range and discarding two thirds of the samples.
    x_ = np.arange(frames, 2 * frames, dtype=np.float64)
    ws = interp1d(x, y, kind='quadratic', axis=0, fill_value='extrapolate')(x_)

    # Make sure the cache directory exists before writing.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    np.save(filename, ws)
    return ws
|
|
|
|
|
def mix_styles(wa, wb, ivs):
    """Blend selected rows of `wb` into a copy of `wa`.

    Args:
        wa: Base array; it is copied, not modified.
        wb: Array to blend in, indexed the same way as `wa`.
        ivs: Iterable of (index, weight) pairs. `index` selects the rows to
            blend; `weight` is the share taken from `wb` (0 keeps `wa`,
            1 takes `wb`).

    Returns:
        The blended copy of `wa`.
    """
    mixed = np.copy(wa)
    for rows, weight in ivs:
        keep = 1 - weight
        mixed[rows] = keep * wa[rows] + weight * wb[rows]
    return mixed
|
|
|
|
|
def normalize_vector(v):
    """Rescale `v` to the standard deviation of `w_avg` and shift its values.

    The result is `v` multiplied by std(w_avg) / std(v), plus the difference
    between the mean of `w_avg` and the mean of `v`.

    NOTE(review): the mean of `v` is subtracted without being rescaled, so
    the result's mean equals mean(w_avg) only when mean(v) is 0 or both
    standard deviations match. Confirm whether that is intended.
    """
    target_std = np.std(w_avg)
    target_mean = np.mean(w_avg)
    rescaled = v * target_std / np.std(v)
    return rescaled + target_mean - np.mean(v)
|
|
|
|
|
def render_frame(t):
    """Render the video frame for time `t` (in seconds).

    Reads the per-frame audio envelopes in `audio` to drive the latent that
    is fed to the synthesis network, and advances the module-level
    `base_index` on every call, so frames are expected to be requested in
    playback order.

    Args:
        t: Time in seconds.

    Returns:
        The generated image as an array, resized to `size` x `size` pixels.
    """
    global base_index

    # Nearest audio frame, clamped to the valid range. Built-in int replaces
    # the `np.int` alias, which was removed in NumPy 1.24.
    frame = min(max(int(np.round(t * fps)), 0), frames - 1)

    # Walk through the base latent sequence at a speed proportional to the
    # squared 'Instrumental' level, wrapping around at the end.
    base_index += base_speed * audio['Instrumental'][frame]**2
    base_w = base_ws[int(round(base_index)) % len(base_ws)]
    # Repeat the vector into 18 rows (presumably one per synthesis layer --
    # TODO confirm against the loaded network).
    base_w = np.tile(base_w, (18, 1))

    # Pull the base latent towards w_avg; psi ranges from 0.5 (silent 'FX')
    # to 1.0 (peak 'FX').
    psi = 0.5 + audio['FX'][frame] / 2
    base_w = w_avg + (base_w - w_avg) * psi

    # Second latent, taken directly at the current frame and pulled towards
    # w_avg with a fixed factor.
    mix_w = np.tile(mix_ws[frame], (18, 1))
    mix_w = w_avg + (mix_w - w_avg) * 0.75

    # Blend the second latent into three row groups, each weighted by the
    # level of its own track.
    ranges = [range(0, 4), range(4, 8), range(8, 18)]
    values = [audio[track][frame] for track in ['Drums', 'E Drums', 'Synth']]
    w = mix_styles(base_w, mix_w, zip(ranges, values))

    # Add the `mouth_open` offset in proportion to the 'Vocal' level.
    w += mouth_open * audio['Vocal'][frame] * 1.5

    image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
    image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
    return np.array(image)
|
|
|
|
|
# Edge length in pixels of the rendered (square) frames.
size = 1080
# The base latent sequence has this many steps per video frame.
resolution = 10

# `duration`, `frames` and `seed` are left over from the audio loading loop
# above (values of the last track it processed).
seconds = int(np.ceil(duration))
base_frames = frames * resolution

# Position in `base_ws`; advanced by render_frame on every call.
base_index = 0
# Scaled so that the squared 'Instrumental' envelope, summed over all frames
# and multiplied by this factor, equals `base_frames`.
base_speed = base_frames / sum(audio['Instrumental']**2)

# Two latent sequences with one key vector per second of audio: a
# high-resolution one indexed via `base_index`, and one indexed per frame.
base_ws = get_ws(seconds, base_frames, seed)
mix_ws = get_ws(seconds, frames, seed + 1)

# https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
# The stored vector is negated before being normalized.
mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))
|
|
|
|
|
# Output file for the finished video.
mp4_filename = 'data/Culture Shock.mp4'

# Soundtrack: the instrumental and vocal stems layered on top of each other.
audio_clip_i = moviepy.editor.AudioFileClip('data/Culture Shock (Instrumental).wav')
audio_clip_v = moviepy.editor.AudioFileClip('data/Culture Shock (Vocal).wav')
audio_clip = moviepy.editor.CompositeAudioClip([audio_clip_i, audio_clip_v])

# Frames come from render_frame; attach the soundtrack and encode.
video_clip = moviepy.editor.VideoClip(render_frame, duration=duration).set_audio(audio_clip)
video_clip.write_videofile(
    mp4_filename,
    fps=fps,
    codec='libx264',
    audio_codec='aac',
    bitrate='8M',
)