@@ -0,0 +1,123 @@
from modal import Image, Stub, Secret, gpu
from pathlib import Path
import os

MODEL_PATH = "/model"
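

# Runs at image build time (via .run_function below): downloads the Llama 2
# tokenizer and weights from Hugging Face and bakes them into the image at
# MODEL_PATH, so containers start with the model already on disk.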
def download_models():
    from transformers import AutoTokenizer, AutoModelForCausalLM

    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)


# adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
# versions might be out of date
llama2_image = (
    Image.micromamba()
    .micromamba_install(
        "cudatoolkit=11.7",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
        "accelerate==0.18.0",
        "bitsandbytes==0.37.0",
        "bitsandbytes-cuda117==0.26.0.post2",
        "datasets==2.10.1",
        "fire==0.5.0",
        "gradio==3.23.0",
        "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
        "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
        "torch==2.0.0",
        "torchvision==0.15.1",
        "sentencepiece==0.1.97",
    )
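    # runs download_models during the image build; the hugging-face secret exposes
    # HUGGINGFACE_TOKEN to that build step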
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
)

stub = Stub(name="llama2", image=llama2_image)


@stub.function(
    gpu=gpu.A100(memory=40),
)
def main():
    """
    Run this function with: modal run modal_llama2::main

    Prereqs:
    - the Modal "hugging-face" secret must be configured correctly
    - you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf (request access on
      Hugging Face and at https://ai.meta.com/resources/models-and-libraries/llama-downloads;
      approval took me ~1 hr)

    The first run needs to build the image, which takes ~20 min (downloading ~30 GB from Hugging Face);
    later runs take < 1 min.
    See this notebook for other things you can do with the model:
    https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, LlamaForCausalLM
    import torch

    load_8bit = False
    device = "cuda"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
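    # inference only: eval() switches off training-time behavior such as dropout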
    model.eval()

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
---
Summary:
"""

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)
    generation_config = GenerationConfig()
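    # an empty GenerationConfig: the sampling parameters passed directly to generate()
    # below take precedence, so they are what actually controls decoding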

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            # parameters below are set arbitrarily; a lot are just defaults
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            top_k=40,
            num_beams=1,
            max_new_tokens=600,
            repetition_penalty=1.2,
        )

    s = generation_output.sequences[0]
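    # sequences[0] holds the prompt tokens followed by the newly generated tokens,
    # so the decoded text repeats the dialog before the summary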
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
    return run_output