Created July 18, 2023 20:30
Revisions
- stevenhao created this gist Jul 18, 2023.

from modal import Image, Stub, Secret, gpu
from pathlib import Path
import os

MODEL_PATH = "/model"


def download_models():
    from transformers import AutoTokenizer, AutoModelForCausalLM

    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)


## adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
# versions might be out of date
llama2_image = (
    Image.micromamba()
    .micromamba_install(
        "cudatoolkit=11.7",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
        "accelerate==0.18.0",
        "bitsandbytes==0.37.0",
        "bitsandbytes-cuda117==0.26.0.post2",
        "datasets==2.10.1",
        "fire==0.5.0",
        "gradio==3.23.0",
        "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
        "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
        "torch==2.0.0",
        "torchvision==0.15.1",
        "sentencepiece==0.1.97",
    )
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
)

stub = Stub(name="llama2", image=llama2_image)


@stub.function(
    gpu=gpu.A100(memory=40),
)
def main():
    """
    run this function: modal run modal_llama2::main

    prereqs:
    - modal hugging-face secret must be configured correctly
    - you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf
      (request access on hugging face & https://ai.meta.com/resources/models-and-libraries/llama-downloads; took me ~1 hr to get approved)

    first run will need to create the image, which takes ~20 mins (downloading ~30 GB from huggingface)
    later runs take < 1 min

    see this notebook for other things you can do with the model:
    https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
    import torch

    load_8bit = False
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()

    from transformers import GenerationConfig

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
---
Summary:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)

    generation_config = GenerationConfig()
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            # parameters below are set arbitrarily; a lot are just defaults
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            top_k=40,
            num_beams=1,
            max_new_tokens=600,
            repetition_penalty=1.2,
        )
    s = generation_output.sequences[0]
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
    return run_output
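Note that the decoded run_output still contains the prompt: for a causal LM, generate returns the prompt tokens followed by the newly sampled tokens. If you only want the text produced after "Summary:", a small helper along these lines (hypothetical, not part of the original gist) can slice the prompt off before decoding:

# Hypothetical helper, not in the original gist: drop the echoed prompt tokens
# and decode only what the model generated after the prompt.
def extract_new_text(tokenizer, input_ids, sequence):
    new_tokens = sequence[input_ids.shape[1]:]  # tokens produced after the prompt
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Inside main(), after generation:
#     summary = extract_new_text(tokenizer, input_ids, generation_output.sequences[0])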
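The gist is meant to be driven with modal run modal_llama2::main, as the docstring says. If you prefer a local entrypoint that invokes the GPU function and prints its return value, a minimal sketch (assuming the 2023-era Modal client pinned by this image, where remote calls are made with .call()) could be appended to the same file:

# Sketch only -- assumes the July-2023 Modal API used above (Stub, .call()).
@stub.local_entrypoint()
def cli():
    # Runs main() remotely on the A100 container and prints the returned text locally.
    print(main.call())

The hugging-face secret referenced by Secret.from_name must expose a HUGGINGFACE_TOKEN environment variable; with the Modal CLI of that period this could be created with something like modal secret create hugging-face HUGGINGFACE_TOKEN=<your token> (the exact command is an assumption; the web dashboard works as well).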