Skip to content

Instantly share code, notes, and snippets.

@stevenhao
Created July 18, 2023 20:30
Show Gist options
  • Save stevenhao/4cb2adbb6ec37b16ea2e292ba4a13177 to your computer and use it in GitHub Desktop.
Save stevenhao/4cb2adbb6ec37b16ea2e292ba4a13177 to your computer and use it in GitHub Desktop.

Revisions

  1. stevenhao created this gist Jul 18, 2023.
    123 changes: 123 additions & 0 deletions modal_llama2.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,123 @@
    from modal import Image, Stub, Secret, gpu
    from pathlib import Path
    import os

    MODEL_PATH = "/model"

    def download_models():
    from transformers import AutoTokenizer, AutoModelForCausalLM
    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)


    ## adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
    # versions might be out of date
    llama2_image = (
    Image.micromamba()
    .micromamba_install(
    "cudatoolkit=11.7",
    "cudnn=8.1.0",
    "cuda-nvcc",
    channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
    "accelerate==0.18.0",
    "bitsandbytes==0.37.0",
    "bitsandbytes-cuda117==0.26.0.post2",
    "datasets==2.10.1",
    "fire==0.5.0",
    "gradio==3.23.0",
    "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
    "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
    "torch==2.0.0",
    "torchvision==0.15.1",
    "sentencepiece==0.1.97",
    )
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
    )

    stub = Stub(name="llama2", image=llama2_image)

    @stub.function(
    gpu=gpu.A100(memory=40),
    )
    def main():
    """
    run this function: modal run modal_llama2::main
    prereqs:
    - modal hugging-face secret must be configured correctly
    - you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf (request access on hugging face & https://ai.meta.com/resources/models-and-libraries/llama-downloads; took me ~1 hr to get approved)
    first run will need to create the image, which takes ~20 mins (downloading ~30 GB from huggingface)
    later runs take < 1 min
    see this notebook for other things you can do with the model: https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
    import torch
    load_8bit = False
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    load_in_8bit=load_8bit,
    torch_dtype=torch.float16,
    device_map="auto",
    )
    model.eval()
    from transformers import GenerationConfig

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
    Summarize this dialog:
    A: Hi Tom, are you busy tomorrow’s afternoon?
    B: I’m pretty sure I am. What’s up?
    A: Can you go with me to the animal shelter?.
    B: What do you want to do?
    A: I want to get a puppy for my son.
    B: That will make him so happy.
    A: Yeah, we’ve discussed it many times. I think he’s ready now.
    B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
    A: I'll get him one of those little dogs.
    B: One that won't grow up too big;-)
    A: And eat too much;-))
    B: Do you know which one he would like?
    A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
    B: I bet you had to drag him away.
    A: He wanted to take it home right away ;-).
    B: I wonder what he'll name it.
    A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
    ---
    Summary:
    """

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)
    generation_config = GenerationConfig()

    with torch.no_grad():
    generation_output = model.generate(
    input_ids=input_ids,
    generation_config=generation_config,
    # parameters below are set arbitrarily; a lot are just defaults
    return_dict_in_generate=True,
    output_scores=True,
    do_sample=True,
    temperature=0.3,
    top_p=0.85,
    top_k=40,
    num_beams=1,
    max_new_tokens=600,
    repetition_penalty=1.2,
    )

    s = generation_output.sequences[0]
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
    return run_output