Last active
June 13, 2024 13:01
-
-
Save sroecker/5adbf966d9eba5522df1c56f9bf7a818 to your computer and use it in GitHub Desktop.
Revisions
-
sroecker renamed this gist
Jun 11, 2024 . 1 changed file with 4 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,6 @@ import modal app = modal.App(name="moondream-label-datikz_v2") data_dict = modal.Dict.from_name("HF_DATASET", create_if_missing=True) def download_dataset(): @@ -70,6 +70,7 @@ def label_dataset(split): # load HF dataset from datasets import load_dataset ds = load_dataset(data_dict["HF_DATASET"], split=split, keep_in_memory=True) #ds = ds.select(range(100)) # for debugging print(len(ds)) # Batch size @@ -85,7 +86,6 @@ def batches(lst, n): from datasets import Image img_enc = Image() r = [] for batch in batches(ds, N): prompts = ["Describe this diagram using the following context, excluding anything that is not directly deducible from the graph: "+c[:1280] for c in batch['caption']] @@ -124,6 +124,8 @@ def main(): result_df = pd.concat(results) result_ds = Dataset.from_pandas(result_df) # properly cast image column from datasets import Image img_enc = Image() result_ds = result_ds.cast_column("image", Image()) print(result_ds) -
sroecker created this gist
Jun 11, 2024 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,131 @@ import modal app = modal.App(name="label-datikz_v2") data_dict = modal.Dict.from_name("HF_DATASET", create_if_missing=True) def download_dataset(): from datasets import load_dataset data_dict["HF_DATASET"] = "nllg/datikz-v2" dataset = load_dataset(data_dict["HF_DATASET"]) def download_model(): model_id = "vikhyatk/moondream2" revision = "2024-05-20" from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained( model_id, trust_remote_code=True, revision=revision, ) tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) moondream_image = modal.Image.micromamba( python_version="3.11" ).apt_install( "git" ).micromamba_install( "cudatoolkit", "cudnn", "cuda-nvcc", channels=["conda-forge", "nvidia"], ).pip_install( "torch", "torchvision", "accelerate", "transformers", "datasets", "einops", "Pillow", "xxhash", gpu="A100" ).run_commands( "pip install flash-attn --no-build-isolation" ).run_function( download_dataset ).run_function(download_model) @app.function(gpu="A100", image=moondream_image, timeout=3600) def label_dataset(split): import torch import pandas as pd from transformers import AutoModelForCausalLM, AutoTokenizer import xxhash # load moondream model model_id = "vikhyatk/moondream2" revision = "2024-05-20" model = AutoModelForCausalLM.from_pretrained( model_id, trust_remote_code=True, revision=revision, device_map = 'cuda', torch_dtype=torch.float16, attn_implementation="flash_attention_2", ).to("cuda") tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024)) print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024)) print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024)) # load HF dataset from datasets import load_dataset ds = load_dataset(data_dict["HF_DATASET"], split=split, keep_in_memory=True) print(len(ds)) # Batch size #N=12 # Fits in 16G VRAM when truncating prompt N=26 # Fits into 40GB VRAM # simple mini batch generator def batches(lst, n): for i in range(0, len(lst), n): yield lst[i:i + n] import pandas as pd from datasets import Image img_enc = Image() r = [] for batch in batches(ds, N): prompts = ["Describe this diagram using the following context, excluding anything that is not directly deducible from the graph: "+c[:1280] for c in batch['caption']] answers = model.batch_answer( images=batch['image'], prompts=prompts, tokenizer=tokenizer, repetition_penalty=1.2, # Important to avoid repetitions, chosen value might not be best ) r.append(pd.DataFrame({'caption': answers, 'orig_caption': batch['caption'], 'image': [img_enc.encode_example(img) for img in batch['image']]} )) if len(r) % 10 == 0: print(len(r)) print("torch.cuda.max_memory_allocated: %fGB"%(torch.cuda.max_memory_allocated(0)/1024/1024/1024)) return pd.concat(r) @app.local_entrypoint() def main(): import pandas as pd from datasets import load_dataset, Dataset # split dataset into 10 equal parts #splits = [f'train[{k}%:{k+10}%]' for k in range(0, 100, 10)] # split dataset into 100 equal parts splits = [f'train[{k}%:{k+2}%]' for k in range(0, 100, 2)] print(splits) results = [] #for part_result in label_dataset.map([splits[0]]): # for debugging for part_result in label_dataset.map(splits): results.append(part_result) print(len(part_result)) # concatenate the list of part_results and load as HF ds result_df = pd.concat(results) result_ds = Dataset.from_pandas(result_df) # properly cast image column result_ds = result_ds.cast_column("image", Image()) print(result_ds) # push result to HF result_ds.push_to_hub('datikz-v2-moondream-labels')