# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
# Uses gdown to fetch the image.
# You will need 26 GB of space: 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion).
# Near-simplest language model API, with room to expand!
# Runs GPT-J-6B on a 3090 or TITAN and serves it using FastAPI.
# Change "seq" (the context size) to adjust the VRAM footprint.
#
# JAX-based
#   seq    vram usage
#   512    14.7G
#   900    15.3G
#
# HF-based
#   seq    vram usage
#   512    15.6 G
#   900    --.- G
#
# Uses FastAPI, so install that
# https://fastapi.tiangolo.com/tutorial/
# pip install fastapi
# pip install "uvicorn[standard]"
# pip install git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3
# pip install termcolor
# # pip install flask-ngrok
# # pip install flask_cloudflared
# pip install pyngrok
# pip install nest-asyncio
# pip install gdown
# gdown --id 1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1 --output ../j6b_ckpt.tar
# (results: 12.6GB [18:19], 11.4MB/s)
#
# note: for my setup I needed to perform the symlink suggested by myjr52 in https://github.com/google/jax/issues/5231
# https://pytorch.org/get-started/previous-versions/
# for CUDA 10.1:
# pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
# for CUDA 11.x (the cu111 wheels are the closest published build for torch 1.8.1):
# pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
# conda install python-multipart
#--------------------------------------
# check pyngrok - https://github.com/alexdlaird/pyngrok
# install:
# pip install pyngrok
#
# Set up your ngrok authtoken:
# ngrok authtoken xxxxxxxxxxxxx
# GO: local execution
# XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_ALLOCATOR=platform CUDA_VISIBLE_DEVICES=0 python3 jserv_hf_fast.py
# When done, try
# http://localhost:9995/docs#/default/read_completions_engines_completions_post
# (adjust the port if you change SERVER_PORT below); now you are in FastAPI + EleutherAI land.
# A sample request is sketched just below these notes.
# note: read_completions needs to be async, otherwise jax gets upset
# REMEMBER: adjust TAR_PATH, the location of the checkpoint image
#
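# Example request once the server is up (an illustrative sketch: the parameter names
# mirror the query parameters of read_completions below, and the port assumes the
# default SERVER_PORT = 9995):
#   curl -X POST "http://localhost:9995/engines/completions?prompt=Once%20upon%20a%20time&max_tokens=32&temperature=0.8&top_p=0.9&n=1"
# The reply is JSON with a "choices" list; each choice carries the generated "text".
#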
# Using plain HF instead of JAX, so the JAX-related steps below are commented out for this install
# -----------------------------------------
# # uses https://github.com/kingoflolz/mesh-transformer-jax
# # so install jax on your system; recommend getting it working with your GPU first
# # !apt install zstd
# #
# # the "slim" version contains only bf16 weights and no optimizer parameters, which minimizes bandwidth and memory
# # wget https://the-eye.eu/public/AI/GPT-J-6B/step_383500_slim.tar.zstd
# # tar -I zstd -xf step_383500_slim.tar.zstd
# # git clone https://github.com/kingoflolz/mesh-transformer-jax.git
# # pip install -r mesh-transformer-jax/requirements.txt
# # jax 0.2.12 is required due to a regression with xmap in 0.2.13
# # pip install mesh-transformer-jax/ jax==0.2.12
# # I have cuda 10.1 and python 3.9 so had to update
# # pip3 install --upgrade "https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.66+cuda101-cp39-none-manylinux2010_x86_64.whl"
# -----------------------------------------
#
# Started 2021-06-19 (USA Juneteenth) and released to freedom under MIT
#
from termcolor import colored
#from flask import Flask, redirect, url_for, request
import json
import torch
import requests
import subprocess
import tarfile
import os
import re
import time
from threading import Timer
from typing import Optional
from typing import Dict
from fastapi import FastAPI, Request, Body
from fastapi.responses import HTMLResponse, JSONResponse
import uvicorn
import nest_asyncio
from pyngrok import ngrok
import threading
import numpy as np
import transformers
from transformers import GPTNeoForCausalLM, AutoConfig, AutoTokenizer, GPT2Tokenizer
| print(colored("Server Initialization ...", "magenta")) | |
| connect_method = "Ngrok" #@param ["Ngrok", "Cloudflare"] | |
| #if connect_method == "Cloudflare": | |
| # from flask_cloudflared import run_with_cloudflared | |
| #elif connect_method == "Ngrok": | |
| # from flask_ngrok import run_with_ngrok | |
| model = None | |
| tokenizer = None | |
| #------------------------------------------ | |
| # REMEMBER: Change these settings to local values | |
| active_model='' | |
| runtime_gpu="cuda:0" | |
| training_gpu="cuda:0" | |
| TAR_PATH ="../" | |
| check_point_dir="../j6b_ckpt" | |
| SERVER_PORT = 9995 | |
| NGROK_AUTH_TOKEN ="xxxxxxxxx" | |
| #----------------------------------------- | |
| #https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu | |
| report_color ="green" | |
| if (not torch.cuda.is_available()): report_color="red" | |
| print(colored(" torch.cuda.is_available() = "+str(torch.cuda.is_available()), report_color)) | |
| print(colored(" torch.cuda.current_device() = "+str(torch.cuda.current_device()), report_color)) | |
| print(colored(" torch.cuda.device_count() = "+str(torch.cuda.device_count()), report_color)) | |
| print(colored(" torch.cuda.get_device_name(0) = "+str(torch.cuda.get_device_name()), report_color)) | |
| print(colored(" Mem Allocated:{}GB".format(round(torch.cuda.memory_allocated(0)/1024**3,1)), report_color)) | |
| print(colored(" Mem Cached: {}GB".format(round(torch.cuda.memory_reserved(0)/1024**3,1)), report_color)) | |
# Set path to the tar file and unpack it if the checkpoint directory is not already there
model_on_drive = TAR_PATH + "j6b_ckpt.tar"
print(colored("Checking j6b_ckpt ...", "magenta"))
print(colored("  TAR_PATH        = {}".format(TAR_PATH), "green"))
print(colored("  check_point_dir = {}".format(check_point_dir), "green"))
print(colored("  model_on_drive  = {}".format(model_on_drive), "green"))
if (not os.path.isdir(check_point_dir)):
    print(colored("Unpacking tar file, please wait...", "magenta"))
    tar = tarfile.open(model_on_drive, "r")
    # extract into TAR_PATH so the result lands at check_point_dir
    # (assumes the tar expands to a top-level j6b_ckpt/ directory)
    tar.extractall(path=TAR_PATH)
    tar.close()
else:
    print(colored("Expanded Checkpoint directory found", "green"))
# Initialize the model
print(colored("Initializing model, please wait...", "magenta"))
# Start from the 2.7B GPT-Neo config and reshape it to the GPT-J-6B geometry:
# 28 layers, 16 heads, hidden size 4096 (256*16), rotary embeddings (dim 64),
# and the 50400-entry vocab used by the converted JAX checkpoint.
config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")
config.attention_layers = ["global"] * 28
config.attention_types = [["global"], 28]
config.num_layers = 28
config.num_heads = 16
config.hidden_size = 256 * config.num_heads
config.vocab_size = 50400
config.rotary = True
config.rotary_dim = 64
config.jax = True
try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping
from pathlib import Path

class Checkpoint(MutableMapping):
    def __init__(self, chkpt_dir, device="cpu"):
        self.device = device
        self.chkpt_dir = Path(chkpt_dir)
        self.checkpoint = torch.load(str(chkpt_dir / Path("m.pt")))
    def __len__(self):
        return len(self.checkpoint)
    def __getitem__(self, key):
        path = self.chkpt_dir / Path(self.checkpoint[key]).name
        return torch.load(str(path), map_location=self.device)
    def __setitem__(self, key, value):
        return
    def __delitem__(self, key):
        return
    def keys(self):
        return self.checkpoint.keys()
    def __iter__(self):
        for key in self.checkpoint:
            yield (key, self.__getitem__(key))
    def __copy__(self):
        return Checkpoint(self.chkpt_dir, device=self.device)
    def copy(self):
        return Checkpoint(self.chkpt_dir, device=self.device)
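# A minimal sketch of poking at the lazy Checkpoint mapping directly (illustrative only;
# GPTNeoForCausalLM.from_pretrained below is the real consumer). It assumes m.pt maps
# parameter names to per-tensor files inside check_point_dir:
#   ckpt = Checkpoint(check_point_dir)
#   print(len(ckpt), "tensors indexed")
#   first_key = next(iter(ckpt.keys()))
#   print(first_key, ckpt[first_key].shape)   # loads only that one tensor from disk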
def infer(context, top_k=40, top_p=0.9, temp=1.0, gen_len=512, repetition_penalty=1):
    tokens = tokenizer(context, return_tensors="pt").input_ids
    ids = tokens.cuda()
    start = time.time()
    #output = network.generate(batched_tokens, length, gen_len, {"top_p": np.ones(total_batch) * top_p, "temp": np.ones(total_batch) * temp})
    output = model.generate(ids,
                            do_sample=True,
                            min_length=gen_len,
                            max_length=gen_len,
                            temperature=temp,
                            use_cache=True,
                            top_p=top_p,
                            repetition_penalty=repetition_penalty,
                            no_repeat_ngram_size=6,
                            max_time=60
                            )
    samples = []
    for i, out_seq in enumerate(output):
        samples.append(tokenizer.decode(out_seq, skip_special_tokens=True))
    #for o in decoded_tokens[:, :, 0]:
    #    samples.append(tokenizer.decode(o))
    print(colored(f"completion done in {time.time() - start:06}s", "green"))
    return samples
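# Quick smoke test for infer() (illustrative; only meaningful once model and tokenizer
# have been loaded further down, as in the PRETEST section):
#   sample = infer("The meaning of life is", top_p=0.9, temp=0.8, gen_len=64)[0]
#   print(sample)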
def recursive_infer(initial_context, current_context=None, top_k=40, top_p=0.9, temp=1.0,
                    gen_len=256, depth=0, max_depth=5, recursive_refresh=0, repetition_penalty=1):
    lcc = 0
    ic = initial_context
    cc = ''
    if current_context:
        lcc = len(current_context)
        cc = current_context
    print(colored("ENTER recursive_infer:{} {} {} {}".format(len(initial_context), lcc, depth, max_depth), "red"))
    print(colored("    in_cc:{}".format(cc), "cyan"))

    c = ''
    if not current_context:
        c = initial_context
    else:
        if (recursive_refresh == 1):
            c = initial_context + "\r\n ... \r\n"
        c = c + current_context
    print(colored("    loc_c:{}".format(c), "yellow"))

    loc_len = gen_len + (len(c) / 3)
    i = infer(c, top_k, top_p, temp, gen_len, repetition_penalty)[0]
    #yield i[len(c):]
    #yield i
    loc_ans = i[len(c):]
    print(colored("    loc_i:{}".format(i), "white"))
    print(colored("    loc_ans:{}".format(loc_ans), "white"))

    if depth >= max_depth: return ''
    #yield from recursive_infer(initial_context, i[len(c):], top_k, top_p, temp, gen_len, depth+1, max_depth, recursive_refresh, repetition_penalty)
    recursive_ans = recursive_infer(initial_context, str(loc_ans), top_k, top_p, temp, gen_len,
                                    depth + 1, max_depth, recursive_refresh, repetition_penalty)
    returned_ans = str(loc_ans + ' ' + recursive_ans)
    print(colored("    returned_ans:{}".format(returned_ans), "cyan"))
    print(colored("EXIT recursive_infer:{} {} {} {}".format(len(initial_context), lcc, depth, max_depth), "red"))
    return returned_ans
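# recursive_infer() usage sketch (illustrative): each level generates a continuation,
# feeds it back in as the new context up to max_depth, and the pieces are concatenated
# on the way back out.
#   story = recursive_infer("Chapter 1:", gen_len=128, max_depth=3, recursive_refresh=1)
#   print(story)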
#model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint())
print(colored("loading GPTNeoForCausalLM.from_pretrained", "magenta"))
print(colored("  loading from {}".format(check_point_dir), "green"))
model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint(check_point_dir))

print(colored("loading GPT2Tokenizer.from_pretrained", "magenta"))
#tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
# Initialize the tokenizer and set up bad_words_ids to exclude Author's Note tags
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab = tokenizer.get_vocab()
vocab_keys = vocab.keys()
find_keys = lambda char: [key for key in vocab_keys if key.find(char) != -1]
bad_words = []
bad_words_ids = []
bad_words.extend(find_keys("["))
bad_words.extend(find_keys(" ["))
bad_words.extend(find_keys("<|endoftext|>"))
for key in bad_words:
    bad_id = vocab[key]
    bad_words_ids.append([bad_id])

print(colored("  move to GPU", "magenta"))
model.to(runtime_gpu)
print(colored("  >>>> DONE! <<<<", "green"))

print(colored("PRETEST: warming up processing pipeline", "magenta"))
# warm up the processing pipeline on startup
pre_prompt = "I am the EleutherAI / GPT-J-6B based AI language model server. I will"
print(colored("PROMPT:" + pre_prompt, "yellow"))
print(colored(infer(pre_prompt)[0], "cyan"))
# app = Flask(__name__)
app = FastAPI()
#if connect_method == "Cloudflare":
#    run_with_cloudflared(app)
#elif connect_method == "Ngrok":
#    run_with_ngrok(app)

@app.get("/", response_class=HTMLResponse)
def home():
    return "<h1>EleutherAI J6B Service Running!</h1>"
# KoboldAI-style endpoint: accepts the JSON body used by KoboldAI clients
@app.post('/request')
async def koboldrequest(request: Request):
    try:
        #clear_output()
        js = await request.json()
        txt = js["text"]
        min_len = js["min"]
        max_len = js["max"]
        rep_pen = js["rep_pen"]
        temp = js["temperature"]
        top_p = js["top_p"]

        # Compatibility with un-updated clients
        numseqs = js.get("numseqs", 1)
        retfultxt = js.get("retfultxt", True)

        print(colored("Received Data: {0}".format(txt), "yellow"))
        torch.cuda.empty_cache()
        print(colored("Generating text, please wait...", "green"))

        tokens = tokenizer(txt, return_tensors="pt").input_ids.to("cpu")
        ids = tokens.cuda()
        gen_tokens = model.generate(
            ids.long().cuda(),
            do_sample=True,
            min_length=min_len,
            max_length=max_len,
            temperature=temp,
            top_p=top_p,
            repetition_penalty=rep_pen,
            use_cache=True,
            bad_words_ids=bad_words_ids,
            num_return_sequences=numseqs
        ).long()

        genout = []
        for tkns in gen_tokens:
            if not retfultxt:
                # Strip context tokens out of returned sequences
                dif = (len(tkns) - len(tokens[0])) * -1
                tkns = tkns[dif:]
                tkns = list(filter(lambda a: a != 50256, tkns))
            genout.append(tokenizer.decode(tkns))
        torch.cuda.empty_cache()

        if len(genout) > 0 and genout[0] != "":
            if retfultxt:
                # Outdated client, send old JSON format
                print(colored("Generated Text: {0}".format(genout[0]), "cyan"))
                return JSONResponse(content={"data": {"text": genout[0]}}, status_code=200)
            else:
                # New client format with numseq support
                for i, seq in enumerate(genout):
                    print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
                return JSONResponse(content={"data": {"seqs": genout}}, status_code=200)
        else:
            print(colored("[ERROR] Something went wrong during generation!", "red"))
            # drop references so the GC can release memory before reporting the failure
            js = {}
            tokens = []
            ids = []
            gen_tokens = []
            genout = ""
            return JSONResponse(
                content={"error": {"extensions": {"code": "Something went wrong during generation!"}}},
                status_code=400
            )
    except Exception as e:
        print(colored("[ERROR] Something went wrong during generation!", "red"))
        print(colored("{0}".format(e), "red"))
        return JSONResponse(
            content={"error": {"extensions": {"code": "Something went wrong during generation! {0}".format(e)}}},
            status_code=400
        )
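# Sketch of a client call for the /request endpoint above (illustrative; the field names
# mirror what koboldrequest reads from the JSON body, and requests is already imported
# at the top of this file):
#   payload = {"text": "You enter the cave.", "min": 10, "max": 80,
#              "rep_pen": 1.1, "temperature": 0.8, "top_p": 0.9,
#              "numseqs": 1, "retfultxt": False}
#   r = requests.post("http://localhost:9995/request", json=payload)
#   print(r.json())   # {"data": {"seqs": [...]}} on success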
| @app.post("/engines/completions") | |
| async def read_completions( | |
| #engine_id:str, | |
| prompt:Optional[str] = None, | |
| max_tokens: Optional[int]=16, | |
| temperature: Optional[float]=1.0, | |
| top_p:Optional[float]=1.0, | |
| top_k:Optional[int]=40, | |
| n:Optional[int]=1, | |
| stream:Optional[bool]=False, | |
| logprobs:Optional[int]=None, | |
| echo:Optional[bool]=False, | |
| stop:Optional[list]=None, | |
| presence_penalty:Optional[float]=0.0001, | |
| repetition_penalty:Optional[float]=1.0000, | |
| best_of:Optional[int]=1, | |
| recursive_depth:Optional[int]=0, | |
| recursive_refresh:Optional[int]=0, | |
| logit_bias:Optional[Dict[str,float]]=None, | |
| request: Request=None | |
| ): | |
| global active_model,model,tokenizer | |
| response={} | |
| response['params']= dict(request.query_params) | |
| print(response) | |
| text = str(prompt) | |
| text = text.replace("|","\r\n") | |
| prompt_len = len(text) | |
| ids = tokenizer(text, return_tensors="pt").input_ids.to(runtime_gpu) | |
| max_length = max_tokens + ids.shape[1] | |
| do_sample=True | |
| use_cache=True | |
| start = time.time() | |
| num_return_sequences=n | |
| num_beams = n | |
| num_beam_groups=n | |
| if (recursive_depth== 0): | |
| gen_tokens = model.generate( | |
| ids, | |
| do_sample=True, | |
| min_length=max_length, | |
| max_length=max_length, | |
| temperature=temperature, | |
| use_cache=True, | |
| num_beams = num_beams, | |
| num_return_sequences=num_return_sequences, | |
| # num_beam_groups=num_beam_groups, | |
| # early_stopping=True, | |
| top_p=top_p, | |
| # top_k=50, | |
| repetition_penalty =repetition_penalty, | |
| no_repeat_ngram_size=6, | |
| max_time=60 | |
| ) | |
| else: | |
| gen_tokens = [] | |
| # do it serial until we figure out parallel for recursive | |
| for x in range(num_return_sequences): | |
| ref_text = str(text) | |
| gen_tokens.append( recursive_infer(initial_context=str(ref_text), | |
| current_context=None, | |
| top_p=top_p,top_k=top_k, temp=temperature, | |
| gen_len=max_length, | |
| depth=0, | |
| max_depth = recursive_depth, | |
| recursive_refresh=recursive_refresh, | |
| repetition_penalty=repetition_penalty | |
| )) | |
| last_prompt=text | |
| choices=[] | |
| gen_text='' | |
| for i,out_seq in enumerate(gen_tokens): | |
| choice={} | |
| choice['prompt']=last_prompt | |
| if (recursive_depth== 0): | |
| choice['text']=tokenizer.decode(out_seq, skip_special_tokens=True) | |
| else: | |
| choice['text']=out_seq | |
| choice['index']=i | |
| choice['logprobs']=None | |
| choice['finish_reason']='length' | |
| choices.append(choice) | |
| print("GenText[{}]:{}".format(i,choice['text'])) | |
| gen_text = gen_text + choice['text'] | |
| if (recursive_depth==0): | |
| last_prompt = text | |
| else: | |
| last_prompt = text | |
| #last_prompt = out_seq | |
| #if (recursive_refresh==1): | |
| # last_prompt = text +"\r\n ... \r\n"+out_seq | |
| #gen_text = tokenizer.batch_decode(gen_tokens)[0] | |
| fin = time.time() | |
| elapsed = fin - start | |
| cps = (len(gen_text)-prompt_len) / elapsed | |
| print("elapsed:{} len:{} cps:{}".format(elapsed,len(gen_text),cps)) | |
| response['id']='' | |
| response['object']='text_completion' | |
| response['created']='' | |
| response['model']= 'GPT-J-6B_HF' #args.model | |
| response['choices']=choices | |
| return(response) | |
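# Sketch of a client call for /engines/completions (illustrative; everything rides in the
# query string because the handler declares plain scalar parameters, and requests is
# already imported above):
#   params = {"prompt": "My dog is", "max_tokens": 40, "temperature": 0.9, "top_p": 0.9, "n": 2}
#   r = requests.post("http://localhost:9995/engines/completions", params=params)
#   for choice in r.json()["choices"]:
#       print(choice["index"], choice["text"])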
| print(colored("Model startup complete! Starting web service....", "green")) | |
| # Setting an auth token allows us to open multiple | |
| # tunnels at the same time | |
| if (NGROK_AUTH_TOKEN is not None) and not ("xxxxxx" in NGROK_AUTH_TOKEN ) : | |
| ngrok.set_auth_token(NGROK_AUTH_TOKEN) | |
| public_url = ngrok.connect(SERVER_PORT) | |
| print(colored("Public_URL = "+str(public_url), "cyan")) | |
| nest_asyncio.apply() | |
| #app.run() | |
| #if __name__ == "__main__": | |
| print(colored("Ready to Serve!", "green")) | |
| uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT) | |
| print (colored("Happy Service!", "green")) | |
| # http://localhost:9995/docs#/default/read_completions_engines_completions_post | |
| # http://<NGROK_URL_ID>.ngrok.io/docs#/default/read_completions_engines_completions_post | |
| # http://<NGROK_URL_ID>.ngrok.io/docs#/default/koboldrequest_request_post | |
awesome!!!
loading from ../j6b_ckpt
Traceback (most recent call last):
  File "jserv-hf.py", line 287, in <module>
    model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint(check_point_dir))
  File "jserv-hf.py", line 201, in __init__
    self.checkpoint = torch.load(str(chkpt_dir / Path("m.pt")))
  File "/home/jp/miniconda3/envs/torch/lib/python3.8/site-packages/torch/serialization.py", line 594, in load
    with _open_file_like(f, 'rb') as opened_file:
  File "/home/jp/miniconda3/envs/torch/lib/python3.8/site-packages/torch/serialization.py", line 230, in _open_file_like
    return _open_file(name_or_buffer, mode)
  File "/home/jp/miniconda3/envs/torch/lib/python3.8/site-packages/torch/serialization.py", line 211, in __init__
    super(_open_file, self).__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: '../j6b_ckpt/m.pt'
Needed to use @app.api_route for some reason.