Forked from ChrisHayduk/merge_qlora_with_quantized_model.py
Created October 25, 2023 05:59
Revisions
ChrisHayduk revised this gist
Sep 29, 2023. 1 changed file with 2 additions and 2 deletions.

Changes the dequantize device default from "cuda" to "cpu" and the load-time device_map to "auto":

@@ -29,7 +29,7 @@ def save_model(model, tokenizer, to):
    with open(os.path.join(to, 'config.json'), 'w') as config:
        config.write(json.dumps(config_data, indent=2))

def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cpu"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.

@@ -88,7 +88,7 @@
        load_in_4bit=True,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        device_map="auto"
    )
    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)
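Side note: with device="cpu" the replacement bf16 Linear layers are built in system RAM, and device_map="auto" lets accelerate spill quantized shards off the GPU, so the dequantized copy never has to fit in VRAM next to the 4-bit model. A minimal sketch of a load under this revision's settings (the model path is a placeholder, as in the gist):

import torch
from transformers import LlamaForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model = LlamaForCausalLM.from_pretrained(
    "Huggingface-base-model/path-goes-here",  # placeholder
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # accelerate decides GPU/CPU placement
)
# dequantize_model(model, tok, device="cpu") then assembles each
# replacement nn.Linear on the CPU.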
ChrisHayduk revised this gist
Sep 24, 2023. 1 changed file with 12 additions and 10 deletions.

Factors the saving logic into a save_model helper and calls it for both the dequantized and the merged model:

@@ -19,6 +19,16 @@
import gc
import copy

def save_model(model, tokenizer, to):
    print(f"Saving dequantized model to {to}...")
    model.save_pretrained(to)
    tokenizer.save_pretrained(to)

    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
        config.write(json.dumps(config_data, indent=2))

def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.

@@ -55,14 +65,7 @@
        model.is_loaded_in_4bit = False

        save_model(model, tokenizer, to)

        return model

@@ -104,8 +107,7 @@
    # Note that the output folder here should be different than the one you used for dequantize_model
    # This save will output the model merged with LoRA weights
    save_model(model, tok, "put-output-folder-here")

    print(f"Successfully saved merged model {model_path} to disk")
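Side note: popping quantization_config (and pretraining_tp) from config.json matters because reloading the dequantized checkpoint would otherwise try to load it in 4-bit again. A quick check one could run after saving (the output folder is a placeholder):

import json, os

out_dir = './dequantized_model'  # placeholder output folder
with open(os.path.join(out_dir, 'config.json')) as f:
    config_data = json.load(f)

# After save_model, neither key should survive in the saved config.
assert 'quantization_config' not in config_data
assert 'pretraining_tp' not in config_data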
ChrisHayduk revised this gist
Sep 11, 2023. 1 changed file with 0 additions and 4 deletions.

@@ -53,8 +53,6 @@
                parent, target, target_name = _get_submodules(model, name)
                setattr(parent, target_name, new_module)

        model.is_loaded_in_4bit = False

        print("Saving dequantized model...")

@@ -69,8 +67,6 @@
        return model

model_path = 'Huggingface-base-model/path-goes-here'
adapter_path = 'Huggingface-adapter/path-goes-here'
ChrisHayduk revised this gist
Sep 11, 2023. 1 changed file with 2 additions and 2 deletions.

Swaps the hard-coded base-model path and dequantize output folder for placeholders:

@@ -71,7 +71,7 @@
model_path = 'Huggingface-base-model/path-goes-here'
adapter_path = 'Huggingface-adapter/path-goes-here'

quantization_config=BitsAndBytesConfig(

@@ -96,7 +96,7 @@
    # Note: This function outputs the dequantized model without merging the adapter yet
    # The code below it will merge the adapter and then save it to disk
    model = dequantize_model(model, tok, to='output-folder-for-dequantized-model-here')
    print(model)

    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
ChrisHayduk revised this gist
Sep 11, 2023. 1 changed file with 2 additions and 0 deletions.

Saves the tokenizer next to the merged model and logs the final save:

@@ -110,6 +110,8 @@
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")

    print(f"Successfully saved merged model {model_path} to disk")

except Exception as e:
    print(f"An error occurred: {e}")
ChrisHayduk revised this gist
Sep 11, 2023. 1 changed file with 9 additions and 0 deletions.

Adds the adapter merge and the final save of the merged model:

@@ -93,14 +93,23 @@
    )
    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    # Note: This function outputs the dequantized model without merging the adapter yet
    # The code below it will merge the adapter and then save it to disk
    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')
    print(model)

    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    print(model)

    model = model.merge_and_unload()
    print(model)

    print(f"Successfully loaded the model {model_path} into memory")

    # Note that the output folder here should be different than the one you used for dequantize_model
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")

except Exception as e:
    print(f"An error occurred: {e}")
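Side note: the ordering matters here. The adapter is attached to the dequantized bf16 model, so merge_and_unload folds the low-rank update into full-precision weights rather than into quantized ones. Per layer the merge is conceptually the following (an illustrative sketch, not PEFT's actual code; shapes assume standard LoRA):

import torch

def merge_lora_weight(W, A, B, lora_alpha, r):
    # W: (out_features, in_features) dequantized base weight
    # A: (r, in_features) and B: (out_features, r) adapter factors
    # merge_and_unload computes W + scaling * (B @ A), scaling = lora_alpha / r
    return W + (lora_alpha / r) * (B @ A)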
ChrisHayduk revised this gist
Aug 29, 2023. 1 changed file with 0 additions and 2 deletions.

Trims two lines from the header docstring:

@@ -1,7 +1,5 @@
"""
The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.

Thanks for the contributions guys!
ChrisHayduk revised this gist
Aug 29, 2023. 1 changed file with 1 addition and 1 deletion.

Fixes the attribution wording ("published by both"):

@@ -2,7 +2,7 @@
NOTE:

The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.

Thanks for the contributions guys!
ChrisHayduk revised this gist
Aug 29, 2023. 1 changed file with 7 additions and 1 deletion.

@@ -1,5 +1,11 @@
"""
NOTE:

The code below combines approaches published both @eugene-yh and @jinyongyoo on Github.

Thanks for the contributions guys!
"""

import torch
ChrisHayduk revised this gist
Aug 29, 2023. 1 changed file with 5 additions and 1 deletion.

Adds the attribution docstring and swaps the adapter path for a placeholder:

@@ -1,3 +1,7 @@
"""
The code below combines approaches published both @eugene-yh and @jinyongyoo on Github.

Thanks for the contributions guys!
"""

import torch
import peft
import json

@@ -64,7 +68,7 @@
model_path = 'NousResearch/Llama-2-13b-hf'
adapter_path = 'Huggingface-adapter/path-goes-here'

quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
ChrisHayduk revised this gist
Aug 25, 2023. 1 changed file with 0 additions and 2 deletions.

Drops the quant_state debug print added in the previous revision:

@@ -36,8 +36,6 @@
                quant_state[2] = dtype

                weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

                new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
ChrisHayduk revised this gist
Aug 25, 2023. 1 changed file with 14 additions and 6 deletions.

Deep-copies the quant_state, overrides its dtype, and builds the replacement Linear with an explicit dtype and device:

@@ -9,13 +9,15 @@
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
import gc
import copy

def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    'to': directory to save the dequantized model
    'dtype': dtype that the model was trained using
    'device': device to load the model to
    """

    # Delete the model object if it exists

@@ -30,11 +32,17 @@
        for name, module in model.named_modules():
            if isinstance(module, cls):
                print(f"Dequantizing `{name}`...")

                quant_state = copy.deepcopy(module.weight.quant_state)

                quant_state[2] = dtype

                print(quant_state)

                weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

                new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
                new_module.weight = torch.nn.Parameter(weights)
                new_module.to(device=device, dtype=dtype)

                parent, target, target_name = _get_submodules(model, name)
                setattr(parent, target_name, new_module)
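Side note: quant_state[2] = dtype assumes the bitsandbytes version of the time, where quant_state was a plain list with the target dtype at index 2; the deepcopy avoids mutating state still referenced by the live module. The per-layer swap, isolated as a sketch under that same assumption:

import copy
import torch
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit

def dequantize_linear4bit(module, dtype=torch.bfloat16, device='cuda'):
    # Copy so the quant state attached to `module` is left untouched.
    quant_state = copy.deepcopy(module.weight.quant_state)
    quant_state[2] = dtype  # index 2 held the dtype in this bnb version
    weights = dequantize_4bit(module.weight.data,
                              quant_state=quant_state,
                              quant_type='nf4').to(dtype)
    new_module = torch.nn.Linear(module.in_features, module.out_features,
                                 bias=False, dtype=dtype)  # bias=False, as in the gist's bias=None
    new_module.weight = torch.nn.Parameter(weights)
    return new_module.to(device=device, dtype=dtype)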
ChrisHayduk revised this gist
Aug 25, 2023. 1 changed file with 9 additions and 18 deletions.

Collapses the save/clear/reload flow into a single pass: dequantize in place, then attach and merge the adapter directly:

@@ -12,7 +12,7 @@
def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.

@@ -32,9 +32,9 @@
                print(f"Dequantizing `{name}`...")

                weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")

                new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None)
                new_module.weight = torch.nn.Parameter(weights)
                new_module.to(device)

                parent, target, target_name = _get_submodules(model, name)
                setattr(parent, target_name, new_module)

@@ -56,11 +56,6 @@
model_path = 'NousResearch/Llama-2-13b-hf'
adapter_path = 'ChrisHayduk/QuerySurge-AI'

@@ -82,20 +77,16 @@
        quantization_config=quantization_config,
        device_map={"": 0}
    )
    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')
    print(model)

    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    print(model)

    model = model.merge_and_unload()
    print(model)

    print(f"Successfully loaded the model {model_path} into memory")

except Exception as e:
    print(f"An error occurred: {e}")
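Side note: _get_submodules is what lets the loop splice a plain nn.Linear back into the module tree. It is roughly the following (paraphrased from peft.utils; check your PEFT version for the exact source):

def get_submodules(model, key):
    # 'model.layers.0.self_attn.q_proj' -> (parent module, child module, 'q_proj')
    parent = model.get_submodule('.'.join(key.split('.')[:-1]))
    target_name = key.split('.')[-1]
    target = model.get_submodule(key)
    return parent, target, target_name

setattr(parent, target_name, new_module) then replaces the Linear4bit child with the dequantized Linear in place.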
ChrisHayduk revised this gist
Aug 25, 2023. No changes.
ChrisHayduk revised this gist
Aug 25, 2023. 1 changed file with 24 additions and 44 deletions.

Moves the save logic (including the is_loaded_in_4bit workaround) inside dequantize_model and simplifies the driver code:

@@ -10,13 +10,14 @@
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
import gc
import torch

def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    """

    # Delete the model object if it exists
    if os.path.exists(to):
        shutil.rmtree(to)

@@ -25,10 +26,8 @@
    cls = bnb.nn.Linear4bit

    with torch.no_grad():
        for name, module in model.named_modules():
            if isinstance(module, cls):
                print(f"Dequantizing `{name}`...")

                weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")

@@ -37,27 +36,34 @@
                new_module.weight = torch.nn.Parameter(weights)
                new_module.bias = torch.nn.Parameter(module.bias)

                parent, target, target_name = _get_submodules(model, name)
                setattr(parent, target_name, new_module)

        # a hack, setting this to avoid hf's saving error because hf
        # itself does not support saving a model that is registered to be loaded in 4bit.
        model.is_loaded_in_4bit = False

        print("Saving dequantized model...")
        model.save_pretrained(to)
        tokenizer.save_pretrained(to)

        config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
        config_data.pop("quantization_config", None)
        config_data.pop("pretraining_tp", None)
        with open(os.path.join(to, 'config.json'), 'w') as config:
            config.write(json.dumps(config_data, indent=2))

        return model

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
import gc

model_path = 'NousResearch/Llama-2-13b-hf'
adapter_path = 'ChrisHayduk/QuerySurge-AI'

quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,

@@ -77,24 +83,19 @@
        device_map={"": 0}
    )
    tok = LlamaTokenizer.from_pretrained(model_path)

    print(f"Successfully loaded the model {model_path} into memory")

    print(f"Dequantizing the model")
    model = dequantize_model(model, tok, to='./dequantized_model')
    print(f"Successfully dequantized the model")

    print(f"Loading and merging the adapter {adapter_path}")
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()
    print(f"Successfully loaded and merged the adapter {adapter_path}")

except Exception as e:
    print(f"An error occurred: {e}")

@@ -109,25 +110,4 @@
    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")
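Side note: the is_loaded_in_4bit flip is a hack around save_pretrained refusing to serialize a model registered as 4-bit, and it is only safe once every Linear4bit has been swapped out. A defensive check one could add before saving (illustrative, not in the gist):

import bitsandbytes as bnb

assert not any(isinstance(m, bnb.nn.Linear4bit) for m in model.modules()), \
    'still-quantized modules remain; the saved weights would be wrong'
model.is_loaded_in_4bit = False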
ChrisHayduk revised this gist
Aug 25, 2023. 1 changed file with 1 addition and 1 deletion.

Updates the startup log message to mention the adapter:

@@ -116,7 +116,7 @@
adapter_path = 'Example/Adapter-Path'

try:
    print(f"Starting to load the model {model_path} with adapter {adapter_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
        model_path,
ChrisHayduk revised this gist
Aug 25, 2023. 1 changed file with 34 additions and 4 deletions.

Splits the pipeline into two stages: dequantize and save, free GPU memory, then reload the dequantized checkpoint and merge the adapter:

@@ -58,7 +58,6 @@
model_path = 'NousResearch/Llama-2-13b-hf'

quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,

@@ -82,10 +81,20 @@
    tok = LlamaTokenizer.from_pretrained(model_path)

    model = dequantize_model(model, tok)

    print(f"Successfully loaded the model {model_path} into memory")

    # Delete the model object if it exists
    if 'model' in locals():
        del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

except Exception as e:
    print(f"An error occurred: {e}")

@@ -100,4 +109,25 @@
    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

#Now reload model into memory
model_path = './dequantized_model'
adapter_path = 'Example/Adapter-Path'

try:
    print(f"Starting to load the model {model_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map={"": 0}
    )
    print(model.model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} and adapter {adapter_path} into memory")
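Side note: the two-stage flow bounds peak memory. Stage one holds the 4-bit weights plus the growing set of bf16 copies; stage two holds only the bf16 checkpoint plus the adapter. A back-of-envelope estimate for a 13B-parameter model (parameter count assumed):

params = 13e9
bf16_gb = params * 2 / 1e9   # ~26 GB dequantized
nf4_gb = params * 0.5 / 1e9  # ~6.5 GB as 4-bit weights, before quant stats
print(f'dequantized ~{bf16_gb:.0f} GB, 4-bit ~{nf4_gb:.0f} GB')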
ChrisHayduk revised this gist
Aug 25, 2023. 1 changed file with 11 additions and 14 deletions.

Hoists the imports from inside dequantize_model to the top of the file:

@@ -1,4 +1,14 @@
import torch
import peft
import json
import shutil
from peft.utils import _get_submodules
import os
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
import gc

def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """

@@ -7,14 +17,6 @@
    """

    # Delete the model object if it exists
    if os.path.exists(to):
        shutil.rmtree(to)

@@ -53,15 +55,10 @@
        return base_model

model_path = 'NousResearch/Llama-2-13b-hf'
adapter_path = 'Example/Adapter-Path'

quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
ChrisHayduk renamed this gist
Aug 25, 2023.

File renamed without changes.
ChrisHayduk created this gist
Aug 25, 2023. The original 106-line file. Note that this first version passes bias=module.bias to torch.nn.Linear (which expects a bool) and wraps a possibly-None bias in torch.nn.Parameter; later revisions change this to bias=None.

@@ -0,0 +1,106 @@
import torch

def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    """
    import peft
    import json
    import shutil
    from peft.utils import _get_submodules
    import os
    import bitsandbytes as bnb
    from bitsandbytes.functional import dequantize_4bit

    # Delete the model object if it exists
    if os.path.exists(to):
        shutil.rmtree(to)

    os.makedirs(to, exist_ok=True)

    cls = bnb.nn.Linear4bit

    base_model = model.model

    with torch.no_grad():
        for name, module in base_model.named_modules():
            if isinstance(module, cls):
                print(f"Dequantizing `{name}`...")

                weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")

                new_module = torch.nn.Linear(module.in_features, module.out_features, bias=module.bias)
                new_module.weight = torch.nn.Parameter(weights)
                new_module.bias = torch.nn.Parameter(module.bias)

                parent, target, target_name = _get_submodules(base_model, name)
                setattr(parent, target_name, new_module)

        # a hack, setting this to avoid hf's saving error because hf
        # itself does not support saving a model that is registered to be loaded in 4bit.
        base_model.is_loaded_in_4bit = False

        print("Saving dequantized model...")
        base_model.save_pretrained(to)
        tokenizer.save_pretrained(to)

        config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
        config_data.pop("quantization_config", None)
        config_data.pop("pretraining_tp", None)
        with open(os.path.join(to, 'config.json'), 'w') as config:
            config.write(json.dumps(config_data, indent=2))

        return base_model

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
import gc

model_path = 'NousResearch/Llama-2-13b-hf'
adapter_path = 'ChrisHayduk/QuerySurge-AI'

quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

try:
    print(f"Starting to load the model {model_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
        model_path,
        load_in_4bit=True,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        device_map={"": 0}
    )
    print(model.model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    model = dequantize_model(model, tok)

    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} into memory")

except Exception as e:
    print(f"An error occurred: {e}")

# Delete the model object if it exists
if 'model' in locals():
    del model

# Clear the GPU cache
torch.cuda.empty_cache()

# Run the garbage collection
gc.collect()

print("Model, GPU cache, and garbage have been cleared.")
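Once the merged model is on disk it behaves like any full-precision checkpoint. A minimal usage sketch (the folder name is a placeholder matching the gist's save call):

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

merged_path = 'put-output-folder-here'  # folder written by the final save
model = LlamaForCausalLM.from_pretrained(
    merged_path, torch_dtype=torch.bfloat16, device_map={'': 0}
)
tok = LlamaTokenizer.from_pretrained(merged_path)

inputs = tok('Hello, world', return_tensors='pt').to(model.device)
out = model.generate(**inputs, max_new_tokens=32)
print(tok.decode(out[0], skip_special_tokens=True))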