Revisions
karpathy revised this gist
Aug 19, 2022 · 1 changed file with 1 addition and 1 deletion.
@@ -138,7 +138,7 @@ def run(
        seed = 1337,
        # --------------------------------------
        # args you probably don't want to change
        quality = 90, # for jpeg compression of the output images
        eta = 0.0,
        width = 512,
        height = 512,
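This one-line fix is worth a note: Pillow's JPEG writer expects an integer quality on a roughly 0-95 scale, so the previous value of 0.9 was on the wrong scale entirely rather than meaning "90%". A minimal sanity check of the corrected call (a sketch assuming only Pillow and NumPy):

from PIL import Image
import numpy as np

im = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
im.save('frame.jpg', quality=90)  # integer scale: 0 (worst) to 95 (best)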
karpathy revised this gist
Aug 19, 2022 · 1 changed file with 11 additions and 9 deletions.
@@ -69,7 +69,6 @@ def diffuse(
    for i, t in enumerate(pipe.scheduler.timesteps):

        # expand the latents for classifier free guidance
        latent_model_input = torch.cat([cond_latents] * 2)
        if isinstance(pipe.scheduler, LMSDiscreteScheduler):
            sigma = pipe.scheduler.sigmas[i]

@@ -83,7 +82,6 @@ def diffuse(
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        if isinstance(pipe.scheduler, LMSDiscreteScheduler):
            cond_latents = pipe.scheduler.step(noise_pred, i, cond_latents, **extra_step_kwargs)["prev_sample"]
        else:

@@ -134,11 +132,13 @@ def run(
        name = 'blueberry', # name of this project, for the output directory
        rootdir = '/home/ubuntu/dreams',
        num_steps = 200, # number of steps between each pair of sampled points
        max_frames = 10000, # number of frames to write and then exit the script
        num_inference_steps = 50, # more (e.g. 100, 200 etc) can create slightly better images
        guidance_scale = 7.5, # can depend on the prompt. usually somewhere between 3-10 is good
        seed = 1337,
        # --------------------------------------
        # args you probably don't want to change
        quality = 0.9, # for jpeg compression of the output images
        eta = 0.0,
        width = 512,
        height = 512,

@@ -147,15 +147,17 @@ def run(
    ):
    assert torch.cuda.is_available()
    assert height % 8 == 0 and width % 8 == 0
    torch.manual_seed(seed)
    torch_device = f"cuda:{gpu}"

    # init the output dir
    outdir = os.path.join(rootdir, name)
    os.makedirs(outdir, exist_ok=True)

    # init all of the models and move them to a given GPU
    lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
    pipe = StableDiffusionPipeline.from_pretrained(weights_path, scheduler=lms, use_auth_token=True)
    pipe.unet.to(torch_device)
    pipe.vae.to(torch_device)
    pipe.text_encoder.to(torch_device)

@@ -182,7 +184,7 @@ def run(
            image = diffuse(pipe, cond_embeddings, init, num_inference_steps, guidance_scale, eta)
            im = Image.fromarray(image)
            outpath = os.path.join(outdir, 'frame%06d.jpg' % frame_index)
            im.save(outpath, quality=quality)
            frame_index += 1

        init1 = init2
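Since the script hands run() to fire.Fire, every keyword argument added in this revision (seed, quality, the commented defaults) is automatically exposed as a command-line flag. A hypothetical invocation exercising the new knobs, in the style of the gist's own usage line (flag values are illustrative):

$ python stablediffusionwalk.py --prompt "blueberry spaghetti" --name blueberry --seed 1337 --num_inference_steps 100 --guidance_scale 7.5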
karpathy revised this gist
Aug 17, 2022 · 1 changed file with 5 additions and 1 deletion.
@@ -2,8 +2,12 @@
stable diffusion dreaming
creates hypnotic moving videos by smoothly walking randomly through the sample space

example way to run this script:
$ python stablediffusionwalk.py --prompt "blueberry spaghetti" --name blueberry

to stitch together the images, e.g.:
$ ffmpeg -r 10 -f image2 -s 512x512 -i blueberry/frame%06d.jpg -vcodec libx264 -crf 10 -pix_fmt yuv420p blueberry.mp4

nice slerp def from @xsteenbrugge ty
you have to have access to stablediffusion checkpoints from https://huggingface.co/CompVis
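For reference, in that ffmpeg command -r 10 sets the frame rate to 10 fps, -crf 10 is a near-lossless H.264 quality setting (lower CRF means higher quality and larger files), and -pix_fmt yuv420p keeps the output playable in most players. A hypothetical variant for a faster, smaller video:

$ ffmpeg -r 30 -f image2 -s 512x512 -i blueberry/frame%06d.jpg -vcodec libx264 -crf 18 -pix_fmt yuv420p blueberry_30fps.mp4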
karpathy revised this gist
Aug 17, 2022 · 1 changed file with 128 additions and 66 deletions.
@@ -1,20 +1,20 @@
"""
stable diffusion dreaming
creates hypnotic moving videos by smoothly walking randomly through the sample space

to stitch together the images, e.g.:
$ ffmpeg -r 10 -f image2 -s 512x512 -i out/frame%04d.jpg -vcodec libx264 -crf 10 -pix_fmt yuv420p test.mp4

nice slerp def from @xsteenbrugge ty
you have to have access to stablediffusion checkpoints from https://huggingface.co/CompVis
and install all the other dependencies (e.g. diffusers library)
"""

import os
import inspect
import fire
from diffusers import StableDiffusionPipeline
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from time import time
from PIL import Image
from einops import rearrange

@@ -23,62 +23,81 @@
from torch import autocast
from torchvision.utils import make_grid

# -----------------------------------------------------------------------------

@torch.no_grad()
def diffuse(
        pipe,
        cond_embeddings, # text conditioning, should be (1, 77, 768)
        cond_latents,    # image conditioning, should be (1, 4, 64, 64)
        num_inference_steps,
        guidance_scale,
        eta,
    ):
    torch_device = cond_latents.get_device()

    # classifier guidance: add the unconditional embedding
    max_length = cond_embeddings.shape[1] # 77
    uncond_input = pipe.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt")
    uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(torch_device))[0]
    text_embeddings = torch.cat([uncond_embeddings, cond_embeddings])

    # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
    if isinstance(pipe.scheduler, LMSDiscreteScheduler):
        cond_latents = cond_latents * pipe.scheduler.sigmas[0]

    # init the scheduler
    accepts_offset = "offset" in set(inspect.signature(pipe.scheduler.set_timesteps).parameters.keys())
    extra_set_kwargs = {}
    if accepts_offset:
        extra_set_kwargs["offset"] = 1
    pipe.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

    # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
    # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
    # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
    # and should be between [0, 1]
    accepts_eta = "eta" in set(inspect.signature(pipe.scheduler.step).parameters.keys())
    extra_step_kwargs = {}
    if accepts_eta:
        extra_step_kwargs["eta"] = eta

    # diffuse!
    for i, t in enumerate(pipe.scheduler.timesteps):

        # expand the latents for classifier free guidance
        # TODO: gross much???
        latent_model_input = torch.cat([cond_latents] * 2)
        if isinstance(pipe.scheduler, LMSDiscreteScheduler):
            sigma = pipe.scheduler.sigmas[i]
            latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        # predict the noise residual
        noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]

        # cfg
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        # TODO: omfg...
        if isinstance(pipe.scheduler, LMSDiscreteScheduler):
            cond_latents = pipe.scheduler.step(noise_pred, i, cond_latents, **extra_step_kwargs)["prev_sample"]
        else:
            cond_latents = pipe.scheduler.step(noise_pred, t, cond_latents, **extra_step_kwargs)["prev_sample"]

    # scale and decode the image latents with vae
    cond_latents = 1 / 0.18215 * cond_latents
    image = pipe.vae.decode(cond_latents)

    # generate output numpy image as uint8
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()
    image = (image[0] * 255).astype(np.uint8)

    return image

def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    """ helper function to spherically interpolate two arrays v1 v2 """

    if not isinstance(v0, np.ndarray):
        inputs_are_torch = True

@@ -103,24 +122,67 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    return v2

def run(
        # --------------------------------------
        # args you probably want to change
        prompt = "blueberry spaghetti", # prompt to dream about
        gpu = 0, # id of the gpu to run on
        name = 'blueberry', # name of this project, for the output directory
        rootdir = '/home/ubuntu/dreams',
        num_steps = 200, # number of steps between each pair of sampled points
        max_frames = 10000, # number of frames to write
        # --------------------------------------
        # args you probably don't want to change
        num_inference_steps = 50,
        guidance_scale = 7.5,
        eta = 0.0,
        width = 512,
        height = 512,
        weights_path = "/home/ubuntu/stable-diffusion-v1-3-diffusers",
        # --------------------------------------
    ):
    assert torch.cuda.is_available()
    assert height % 8 == 0 and width % 8 == 0
    torch.manual_seed(1337)

    # init the output dir
    outdir = os.path.join(rootdir, name)
    os.makedirs(outdir, exist_ok=True)

    # init all of the models and move them to a given GPU
    pipe = StableDiffusionPipeline.from_pretrained(weights_path, use_auth_token=True)
    torch_device = f"cuda:{gpu}"
    pipe.unet.to(torch_device)
    pipe.vae.to(torch_device)
    pipe.text_encoder.to(torch_device)

    # get the conditional text embeddings based on the prompt
    text_input = pipe.tokenizer(prompt, padding="max_length", max_length=pipe.tokenizer.model_max_length, truncation=True, return_tensors="pt")
    cond_embeddings = pipe.text_encoder(text_input.input_ids.to(torch_device))[0] # shape [1, 77, 768]

    # sample a source
    init1 = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=torch_device)

    # iterate the loop
    frame_index = 0
    while frame_index < max_frames:

        # sample the destination
        init2 = torch.randn((1, pipe.unet.in_channels, height // 8, width // 8), device=torch_device)

        for i, t in enumerate(np.linspace(0, 1, num_steps)):
            init = slerp(float(t), init1, init2)

            print("dreaming... ", frame_index)
            with autocast("cuda"):
                image = diffuse(pipe, cond_embeddings, init, num_inference_steps, guidance_scale, eta)
            im = Image.fromarray(image)
            outpath = os.path.join(outdir, 'frame%06d.jpg' % frame_index)
            im.save(outpath)
            frame_index += 1

        init1 = init2

if __name__ == '__main__':
    fire.Fire(run)
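A note on why the script slerps between noise samples instead of lerping: two independent high-dimensional Gaussian samples are nearly orthogonal, so points on the straight line between them have noticeably smaller norm and drift off the distribution the diffusion model was trained to denoise, while spherical interpolation keeps the norm roughly constant. A self-contained NumPy sketch of the effect (dimensionality chosen to match the 1x4x64x64 latents):

import numpy as np

rng = np.random.default_rng(0)
v0 = rng.standard_normal(4 * 64 * 64)
v1 = rng.standard_normal(4 * 64 * 64)

t = 0.5
lerp_mid = (1 - t) * v0 + t * v1

# same math as the slerp() above, minus the near-parallel edge case
dot = np.sum(v0 * v1) / (np.linalg.norm(v0) * np.linalg.norm(v1))
theta = np.arccos(dot)
slerp_mid = (np.sin((1 - t) * theta) * v0 + np.sin(t * theta) * v1) / np.sin(theta)

print(np.linalg.norm(v0))         # ~128 (sqrt(16384))
print(np.linalg.norm(lerp_mid))   # ~90, the linear midpoint collapses toward the origin
print(np.linalg.norm(slerp_mid))  # ~128, norm is preserved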
karpathy revised this gist
Aug 16, 2022 · 1 changed file with 2 additions and 1 deletion.
@@ -115,7 +115,8 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    for i, t in enumerate(np.linspace(0, 1, 200)):
        init = slerp(float(t), init1, init2)
        with autocast("cuda"):
            image = diffuse(text_embeddings, init, guidance_scale=10.0)
        im = Image.fromarray((image[0] * 255).astype(np.uint8))
        im.save('/home/ubuntu/out/frame%06d.jpg' % n)
        print('dreaming... ', n)
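The change here wraps the sampling call in torch's autocast context so the UNet and VAE forward passes run in float16 where safe, which roughly halves memory use and speeds up each step on CUDA. A minimal illustration of the context manager (a sketch assuming a CUDA device is available):

import torch
from torch import autocast

x = torch.randn(8, 8, device="cuda")
with autocast("cuda"):
    y = x @ x  # matmuls inside the context run in half precision
print(y.dtype)  # torch.float16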
karpathy revised this gist
Aug 16, 2022 · 1 changed file with 1 addition and 1 deletion.
@@ -117,7 +117,7 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
        init = slerp(float(t), init1, init2)
        image = diffuse(text_embeddings, init, guidance_scale=10.0)
        im = Image.fromarray((image[0] * 255).astype(np.uint8))
        im.save('/home/ubuntu/out/frame%06d.jpg' % n)
        print('dreaming... ', n)
        n += 1
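This revision widens the frame filename from %04d to %06d so names keep zero-padding (and therefore sorting correctly) past frame 9999, since the dream loop runs indefinitely; the ffmpeg input pattern has to match. The padding behavior in plain Python:

print('frame%04d.jpg' % 12345)  # 'frame12345.jpg' -- overflows the field width, sort order breaks
print('frame%06d.jpg' % 12345)  # 'frame012345.jpg'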
karpathy revised this gist
Aug 16, 2022 · 1 changed file with 2 additions and 0 deletions.
@@ -10,6 +10,8 @@
THIS FILE IS HACKY AND NOT CONFIGURABLE READ THE CODE, MAKE EDITS TO PATHS AND SETTINGS YOU LIKE

nice slerp def from @xsteenbrugge ty
you have to have access to stablediffusion checkpoints from https://huggingface.co/CompVis
and install all the other dependencies (e.g. diffusers library)
"""

from diffusers import StableDiffusionPipeline
karpathy created this gist
Aug 16, 2022.
@@ -0,0 +1,123 @@
"""
draws many samples from a diffusion model by slerp'ing around
the noise space, and dumps frames to a directory. You can then
stitch up the frames with e.g.:

$ ffmpeg -r 10 -f image2 -s 512x512 -i out/frame%04d.jpg -vcodec libx264 -crf 10 -pix_fmt yuv420p test.mp4

THIS FILE IS HACKY AND NOT CONFIGURABLE READ THE CODE, MAKE EDITS TO PATHS AND SETTINGS YOU LIKE
THIS FILE IS HACKY AND NOT CONFIGURABLE READ THE CODE, MAKE EDITS TO PATHS AND SETTINGS YOU LIKE
THIS FILE IS HACKY AND NOT CONFIGURABLE READ THE CODE, MAKE EDITS TO PATHS AND SETTINGS YOU LIKE

nice slerp def from @xsteenbrugge ty
"""

from diffusers import StableDiffusionPipeline
from time import time
from PIL import Image
from einops import rearrange
import numpy as np
import torch
from torch import autocast
from torchvision.utils import make_grid

torch.manual_seed(42)

pipe = StableDiffusionPipeline.from_pretrained("/home/ubuntu/stable-diffusion-v1-3-diffusers", use_auth_token=True)
torch_device = 'cuda:3'
pipe.unet.to(torch_device)
pipe.vae.to(torch_device)
pipe.text_encoder.to(torch_device)
print('w00t')

batch_size = 1
height = 512
width = 512
prompt = ["ultrarealistic steam punk neural network machine in the shape of a brain, placed on a pedestal, covered with neurons made of gears. dramatic lighting. #unrealengine"] * 1
text_input = pipe.tokenizer(prompt, padding=True, truncation=True, return_tensors="pt")
text_embeddings = pipe.text_encoder(text_input.input_ids.to(torch_device))[0]

@torch.no_grad()
def diffuse(text_embeddings, init, guidance_scale = 7.5):
    # text_embeddings are n,t,d
    max_length = text_embeddings.shape[1]
    uncond_input = pipe.tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
    uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(torch_device))[0]
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    latents = init.clone()
    num_inference_steps = 50
    pipe.scheduler.set_timesteps(num_inference_steps)
    for t in pipe.scheduler.timesteps:
        # predict the noise residual
        latent_model_input = torch.cat([latents] * 2) # for cfg
        noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
        # compute the previous noisy sample x_t -> x_t-1
        latents = pipe.scheduler.step(noise_pred, t, latents)["prev_sample"]

    # post-process
    latents = 1 / 0.18215 * latents
    image = pipe.vae.decode(latents)
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()
    return image

def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    if not isinstance(v0, np.ndarray):
        inputs_are_torch = True
        input_device = v0.device
        v0 = v0.cpu().numpy()
        v1 = v1.cpu().numpy()

    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
    if np.abs(dot) > DOT_THRESHOLD:
        v2 = (1 - t) * v0 + t * v1
    else:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)
        theta_t = theta_0 * t
        sin_theta_t = np.sin(theta_t)
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1

    if inputs_are_torch:
        v2 = torch.from_numpy(v2).to(input_device)

    return v2

# DREAM
# sample start
init1 = torch.randn((batch_size, pipe.unet.in_channels, height // 8, width // 8)).to(torch_device)
n = 0
while True:
    # sample destination
    init2 = torch.randn((batch_size, pipe.unet.in_channels, height // 8, width // 8)).to(torch_device)
    for i, t in enumerate(np.linspace(0, 1, 200)):
        init = slerp(float(t), init1, init2)
        image = diffuse(text_embeddings, init, guidance_scale=10.0)
        im = Image.fromarray((image[0] * 255).astype(np.uint8))
        im.save('/home/ubuntu/out/frame%04d.jpg' % n)
        print('dreaming... ', n)
        n += 1
    init1 = init2
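The guidance step inside diffuse() is plain classifier-free guidance: one UNet pass is batched over the unconditional and text-conditioned embeddings, and the two noise predictions are recombined by extrapolating from the unconditional prediction toward (and past) the conditioned one. A self-contained sketch of just that arithmetic, on stand-in tensors:

import torch

guidance_scale = 10.0
# stand-ins for the two halves of the batched UNet output
noise_pred_uncond = torch.randn(1, 4, 64, 64)
noise_pred_text = torch.randn(1, 4, 64, 64)

# classifier-free guidance, exactly the combination used in diffuse()
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)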