modal_llama2.py
Gist by @stevenhao, created July 18, 2023
https://gist.github.com/stevenhao/4cb2adbb6ec37b16ea2e292ba4a13177
from modal import Image, Stub, Secret, gpu
from pathlib import Path
import os

MODEL_PATH = "/model"


def download_models():
    from transformers import AutoTokenizer, AutoModelForCausalLM

    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)
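
# download_models runs once at image-build time (via .run_function below), so the Llama 2
# weights are baked into the image layer; that's why the first run is slow and later runs
# skip the download entirely.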


## adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
# versions might be out of date
llama2_image = (
    Image.micromamba()
    .micromamba_install(
        "cudatoolkit=11.7",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
        "accelerate==0.18.0",
        "bitsandbytes==0.37.0",
        "bitsandbytes-cuda117==0.26.0.post2",
        "datasets==2.10.1",
        "fire==0.5.0",
        "gradio==3.23.0",
        "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
        "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
        "torch==2.0.0",
        "torchvision==0.15.1",
        "sentencepiece==0.1.97",
    )
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
)
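
# Secret.from_name("hugging-face") assumes a Modal secret named "hugging-face" that exposes
# HUGGINGFACE_TOKEN (read in download_models above). One way to create it, assuming the Modal
# CLI, is roughly:
#   modal secret create hugging-face HUGGINGFACE_TOKEN=<your Hugging Face access token>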

    stub = Stub(name="llama2", image=llama2_image)
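# Note: Stub is the Modal app object as of the Modal client available when this gist was written;
# newer Modal releases rename it to App.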

@stub.function(
    gpu=gpu.A100(memory=40),
)
def main():
    """
    Run this function with: modal run modal_llama2::main

    Prereqs:
    - the Modal "hugging-face" secret must be configured correctly
    - you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf
      (request access on Hugging Face and at https://ai.meta.com/resources/models-and-libraries/llama-downloads;
      approval took me ~1 hr)

    The first run needs to build the image, which takes ~20 mins (it downloads ~30 GB from Hugging Face);
    later runs take < 1 min.
    See this notebook for other things you can do with the model:
    https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
    import torch

    load_8bit = False
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()
    from transformers import GenerationConfig

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
---
Summary:
"""

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)
    generation_config = GenerationConfig()

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            # parameters below are set arbitrarily; a lot are just defaults
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            top_k=40,
            num_beams=1,
            max_new_tokens=600,
            repetition_penalty=1.2,
        )

    s = generation_output.sequences[0]
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
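    # (not in the original gist) generate() returns the prompt tokens followed by the new tokens,
    # so the decoded output echoes the prompt. To keep only the continuation, a common pattern is:
    # completion = tokenizer.decode(s[input_ids.shape[-1]:], skip_special_tokens=True)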
    return run_output
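

# A minimal local-entrypoint sketch (not in the original gist), so the script can also be
# invoked with plain `modal run modal_llama2.py`. It assumes the mid-2023 Modal client API,
# where Function.call() runs the decorated function remotely from local code.
@stub.local_entrypoint()
def cli():
    main.call()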