Revisions

  1. @ChrisHayduk revised this gist Sep 29, 2023. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -29,7 +29,7 @@ def save_model(model, tokenizer, to):
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cpu"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    @@ -88,7 +88,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    device_map={"": 0}
    device_map="auto"
    )
    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)
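    For orientation, the sketch below condenses the end-to-end flow as it reads after this latest revision: load the base model in 4-bit with device_map="auto", dequantize to bfloat16 on the CPU, merge the LoRA adapter, and save. dequantize_model and save_model are the helpers defined in the gist; the model, adapter, and output paths are placeholders.

      import torch
      from peft import PeftModel
      from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig

      base_path = "Huggingface-base-model/path-goes-here"      # placeholder
      adapter_path = "Huggingface-adapter/path-goes-here"      # placeholder

      quantization_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_compute_dtype=torch.bfloat16,
          bnb_4bit_use_double_quant=True,
          bnb_4bit_quant_type="nf4",
      )

      # Load the 4-bit base model; device_map="auto" lets accelerate place the layers
      model = LlamaForCausalLM.from_pretrained(
          base_path,
          torch_dtype=torch.bfloat16,
          quantization_config=quantization_config,
          device_map="auto",
      )
      tok = LlamaTokenizer.from_pretrained(base_path)

      # Dequantize the NF4 weights back to bfloat16, materializing them on the CPU
      # (the device this revision switches to), then attach and merge the adapter.
      # dequantize_model and save_model are the gist's own helpers.
      model = dequantize_model(model, tok, to="dequantized-output-folder", device="cpu")
      model = PeftModel.from_pretrained(model, adapter_path)
      model = model.merge_and_unload()

      # Save the merged model to a different folder than the dequantized one
      save_model(model, tok, "merged-output-folder")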
  2. @ChrisHayduk revised this gist Sep 24, 2023. 1 changed file with 12 additions and 10 deletions.
    22 changes: 12 additions & 10 deletions merge_qlora_with_quantized_model.py
    @@ -19,6 +19,16 @@
    import gc
    import copy

    def save_model(model, tokenizer, to):
    print(f"Saving dequantized model to {to}...")
    model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    @@ -55,14 +65,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))
    save_model(model, tokenizer, to)

    return model

    @@ -104,8 +107,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    # Note that the output folder here should be different than the one you used for dequantize_model
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")
    save_model(model, tok, "put-output-folder-here")

    print(f"Successfully saved merged model {model_path} to disk")

  3. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 0 additions and 4 deletions.
    4 changes: 0 additions & 4 deletions merge_qlora_with_quantized_model.py
    @@ -53,8 +53,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)

    # a hack, setting this to avoid hf's saving error because hf
    # itself does not support saving a model that is registered to be loaded in 4bit.
    model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    @@ -69,8 +67,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    return model




    model_path = 'Huggingface-base-model/path-goes-here'
    adapter_path = 'Huggingface-adapter/path-goes-here'

  4. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -71,7 +71,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl



    model_path = 'NousResearch/Llama-2-13b-hf'
    model_path = 'Huggingface-base-model/path-goes-here'
    adapter_path = 'Huggingface-adapter/path-goes-here'

    quantization_config=BitsAndBytesConfig(
    @@ -96,7 +96,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    # Note: This function outputs the dequantized model without merging the adapter yet
    # The code below it will merge the adapter and then save it to disk
    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')
    model = dequantize_model(model, tok, to='output-folder-for-dequantized-model-here')

    print(model)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
  5. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions merge_qlora_with_quantized_model.py
    @@ -110,6 +110,8 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")

    print(f"Successfully saved merged model {model_path} to disk")

    except Exception as e:
    print(f"An error occurred: {e}")
  6. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 9 additions and 0 deletions.
    9 changes: 9 additions & 0 deletions merge_qlora_with_quantized_model.py
    @@ -93,14 +93,23 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    )
    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    # Note: This function outputs the dequantized model without merging the adapter yet
    # The code below it will merge the adapter and then save it to disk
    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')

    print(model)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    print(model)
    model = model.merge_and_unload()
    print(model)

    print(f"Successfully loaded the model {model_path} into memory")

    # Note that the output folder here should be different than the one you used for dequantize_model
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")

    except Exception as e:
    print(f"An error occurred: {e}")
  7. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -1,7 +1,5 @@
    """
    NOTE:
    The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.
    Thanks for the contributions guys!
  8. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion merge_qlora_with_quantized_model.py
    @@ -2,7 +2,7 @@
    NOTE:
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github.
    The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.
    Thanks for the contributions guys!
  9. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 7 additions and 1 deletion.
    8 changes: 7 additions & 1 deletion merge_qlora_with_quantized_model.py
    @@ -1,5 +1,11 @@
    """
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github. Thanks for the contributions guys!
    NOTE:
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github.
    Thanks for the contributions guys!
    """

    import torch
  10. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion merge_qlora_with_quantized_model.py
    @@ -1,3 +1,7 @@
    """
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github. Thanks for the contributions guys!
    """

    import torch
    import peft
    import json
    @@ -64,7 +68,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'
    adapter_path = 'Huggingface-adapter/path-goes-here'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
  11. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -36,8 +36,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    quant_state[2] = dtype

    print(quant_state)

    weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
  12. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 14 additions and 6 deletions.
    20 changes: 14 additions & 6 deletions merge_qlora_with_quantized_model.py
    @@ -9,13 +9,15 @@
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc
    import copy

    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    'to': directory to save the dequantized model
    'dtype': dtype that the model was trained using
    'device': device to load the model to
    """

    # Delete the model object if it exists
    @@ -30,11 +32,17 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    for name, module in model.named_modules():
    if isinstance(module, cls):
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")
    quant_state = copy.deepcopy(module.weight.quant_state)

    quant_state[2] = dtype

    print(quant_state)

    weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None)
    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
    new_module.weight = torch.nn.Parameter(weights)
    new_module.to(device)
    new_module.to(device=device, dtype=dtype)

    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)
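    The quant_state handling added here is the subtle part of this revision: the state is deep-copied so the module's own metadata stays untouched, and slot 2 of the copy, which the gist treats as the target dtype in the list-style quant_state of the bitsandbytes version used here, is overridden so dequantize_4bit emits bfloat16 directly. A compact restatement for a single Linear4bit module (newer bitsandbytes releases expose quant_state as an object, so the index-based override may need adapting there):

      import copy
      import torch
      import bitsandbytes as bnb
      from bitsandbytes.functional import dequantize_4bit

      def dequantize_linear4bit(module: bnb.nn.Linear4bit, dtype=torch.bfloat16, device="cuda"):
          # Copy the quantization metadata and force the dequantized output dtype (slot 2)
          quant_state = copy.deepcopy(module.weight.quant_state)
          quant_state[2] = dtype
          weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)
          # Rebuild a plain nn.Linear carrying the dequantized weights
          new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
          new_module.weight = torch.nn.Parameter(weights)
          return new_module.to(device=device, dtype=dtype)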
  13. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 9 additions and 18 deletions.
    27 changes: 9 additions & 18 deletions merge_qlora_with_quantized_model.py
    @@ -12,7 +12,7 @@

    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    @@ -32,9 +32,9 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=module.bias)
    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None)
    new_module.weight = torch.nn.Parameter(weights)
    new_module.bias = torch.nn.Parameter(module.bias)
    new_module.to(device)

    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)
    @@ -56,11 +56,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo



    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'
    @@ -82,20 +77,16 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    quantization_config=quantization_config,
    device_map={"": 0}
    )

    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    print(f"Successfully loaded the model {model_path} into memory")

    print(f"Dequantizing the model")
    model = dequantize_model(model, tok, to='./dequantized_model')
    print(f"Successfully dequantized the model")

    print(f"Loading and merging the adapter {adapter_path}")
    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')
    print(model)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    print(model)
    model = model.merge_and_unload()
    print(f"Successfully loaded and merged the adapter {adapter_path}")
    print(model)

    print(f"Successfully loaded the model {model_path} into memory")

    except Exception as e:
    print(f"An error occurred: {e}")
  14. @ChrisHayduk revised this gist Aug 25, 2023. No changes.
  15. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 24 additions and 44 deletions.
    68 changes: 24 additions & 44 deletions merge_qlora_with_quantized_model.py
    @@ -10,13 +10,14 @@
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc

    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    """


    # Delete the model object if it exists
    if os.path.exists(to):
    shutil.rmtree(to)
    @@ -25,10 +26,8 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo

    cls = bnb.nn.Linear4bit

    base_model = model.model

    with torch.no_grad():
    for name, module in base_model.named_modules():
    for name, module in model.named_modules():
    if isinstance(module, cls):
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")
    @@ -37,27 +36,34 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    new_module.weight = torch.nn.Parameter(weights)
    new_module.bias = torch.nn.Parameter(module.bias)

    parent, target, target_name = _get_submodules(base_model, name)
    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)

    # a hack, setting this to avoid hf's saving error because hf
    # itself does not support saving a model that is registered to be loaded in 4bit.
    base_model.is_loaded_in_4bit = False
    model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    base_model.save_pretrained(to)
    model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    return base_model
    return model



    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    @@ -77,24 +83,19 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = dequantize_model(model, tok)

    print(f"Successfully loaded the model {model_path} into memory")

    # Delete the model object if it exists
    if 'model' in locals():
    del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")
    print(f"Dequantizing the model")
    model = dequantize_model(model, tok, to='./dequantized_model')
    print(f"Successfully dequantized the model")

    print(f"Loading and merging the adapter {adapter_path}")
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()
    print(f"Successfully loaded and merged the adapter {adapter_path}")


    except Exception as e:
    print(f"An error occurred: {e}")
    @@ -109,25 +110,4 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

    #Now reload model into memory
    model_path = './dequantized_model'
    adapter_path = 'Example/Adapter-Path'

    try:
    print(f"Starting to load the model {model_path} with adapter {adapter_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} and adapter {adapter_path} into memory")
    print("Model, GPU cache, and garbage have been cleared.")
  16. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion merge_qlora_with_quantized_model.py
    @@ -116,7 +116,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    adapter_path = 'Example/Adapter-Path'

    try:
    print(f"Starting to load the model {model_path} into memory")
    print(f"Starting to load the model {model_path} with adapter {adapter_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
  17. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 34 additions and 4 deletions.
    38 changes: 34 additions & 4 deletions merge_qlora_with_quantized_model.py
    @@ -58,7 +58,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'Example/Adapter-Path'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    @@ -82,10 +81,20 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = dequantize_model(model, tok)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} into memory")

    # Delete the model object if it exists
    if 'model' in locals():
    del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

    except Exception as e:
    print(f"An error occurred: {e}")
    @@ -100,4 +109,25 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")
    print("Model, GPU cache, and garbage have been cleared.")

    #Now reload model into memory
    model_path = './dequantized_model'
    adapter_path = 'Example/Adapter-Path'

    try:
    print(f"Starting to load the model {model_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} and adapter {adapter_path} into memory")
  18. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 11 additions and 14 deletions.
    25 changes: 11 additions & 14 deletions merge_qlora_with_quantized_model.py
    @@ -1,4 +1,14 @@
    import torch
    import peft
    import json
    import shutil
    from peft.utils import _get_submodules
    import os
    import bitsandbytes as bnb
    from bitsandbytes.functional import dequantize_4bit
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    @@ -7,14 +17,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    """


    import peft
    import json
    import shutil
    from peft.utils import _get_submodules
    import os
    import bitsandbytes as bnb
    from bitsandbytes.functional import dequantize_4bit

    # Delete the model object if it exists
    if os.path.exists(to):
    shutil.rmtree(to)
    @@ -53,15 +55,10 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo

    return base_model


    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'
    adapter_path = 'Example/Adapter-Path'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
  19. @ChrisHayduk renamed this gist Aug 25, 2023. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  20. @ChrisHayduk created this gist Aug 25, 2023.
    106 changes: 106 additions & 0 deletions gistfile1.txt
    @@ -0,0 +1,106 @@
    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    """


    import peft
    import json
    import shutil
    from peft.utils import _get_submodules
    import os
    import bitsandbytes as bnb
    from bitsandbytes.functional import dequantize_4bit

    # Delete the model object if it exists
    if os.path.exists(to):
    shutil.rmtree(to)

    os.makedirs(to, exist_ok=True)

    cls = bnb.nn.Linear4bit

    base_model = model.model

    with torch.no_grad():
    for name, module in base_model.named_modules():
    if isinstance(module, cls):
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=module.bias)
    new_module.weight = torch.nn.Parameter(weights)
    new_module.bias = torch.nn.Parameter(module.bias)

    parent, target, target_name = _get_submodules(base_model, name)
    setattr(parent, target_name, new_module)

    # a hack, setting this to avoid hf's saving error because hf
    # itself does not support saving a model that is registered to be loaded in 4bit.
    base_model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    base_model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    return base_model


    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    )

    try:
    print(f"Starting to load the model {model_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = dequantize_model(model, tok)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} into memory")

    except Exception as e:
    print(f"An error occurred: {e}")

    # Delete the model object if it exists
    if 'model' in locals():
    del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")