Revisions

  1. @ChrisHayduk revised this gist Sep 29, 2023. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -29,7 +29,7 @@ def save_model(model, tokenizer, to):
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cpu"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    @@ -88,7 +88,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    device_map={"": 0}
    device_map="auto"
    )
    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)
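    For orientation, the sketch below condenses the end-to-end flow as it reads after this latest revision: load the base model in 4-bit with device_map="auto", dequantize to bfloat16 on the CPU, merge the LoRA adapter, and save. dequantize_model and save_model are the helpers defined in the gist; the model, adapter, and output paths are placeholders.

      import torch
      from peft import PeftModel
      from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig

      base_path = "Huggingface-base-model/path-goes-here"      # placeholder
      adapter_path = "Huggingface-adapter/path-goes-here"      # placeholder

      quantization_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_compute_dtype=torch.bfloat16,
          bnb_4bit_use_double_quant=True,
          bnb_4bit_quant_type="nf4",
      )

      # Load the 4-bit base model; device_map="auto" lets accelerate place the layers
      model = LlamaForCausalLM.from_pretrained(
          base_path,
          torch_dtype=torch.bfloat16,
          quantization_config=quantization_config,
          device_map="auto",
      )
      tok = LlamaTokenizer.from_pretrained(base_path)

      # Dequantize the NF4 weights back to bfloat16, materializing them on the CPU
      # (the device this revision switches to), then attach and merge the adapter.
      # dequantize_model and save_model are the gist's own helpers.
      model = dequantize_model(model, tok, to="dequantized-output-folder", device="cpu")
      model = PeftModel.from_pretrained(model, adapter_path)
      model = model.merge_and_unload()

      # Save the merged model to a different folder than the dequantized one
      save_model(model, tok, "merged-output-folder")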
  2. @ChrisHayduk revised this gist Sep 24, 2023. 1 changed file with 12 additions and 10 deletions.
    22 changes: 12 additions & 10 deletions merge_qlora_with_quantized_model.py
    @@ -19,6 +19,16 @@
    import gc
    import copy

    def save_model(model, tokenizer, to):
    print(f"Saving dequantized model to {to}...")
    model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    @@ -55,14 +65,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))
    save_model(model, tokenizer, to)

    return model

    @@ -104,8 +107,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    # Note that the output folder here should be different than the one you used for dequantize_model
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")
    save_model(model, tok, "put-output-folder-here")

    print(f"Successfully saved merged model {model_path} to disk")

  3. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 0 additions and 4 deletions.
    4 changes: 0 additions & 4 deletions merge_qlora_with_quantized_model.py
    @@ -53,8 +53,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)

    # a hack, setting this to avoid hf's saving error because hf
    # itself does not support saving a model that is registered to be loaded in 4bit.
    model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    @@ -69,8 +67,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    return model




    model_path = 'Huggingface-base-model/path-goes-here'
    adapter_path = 'Huggingface-adapter/path-goes-here'

  4. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -71,7 +71,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl



    model_path = 'NousResearch/Llama-2-13b-hf'
    model_path = 'Huggingface-base-model/path-goes-here'
    adapter_path = 'Huggingface-adapter/path-goes-here'

    quantization_config=BitsAndBytesConfig(
    @@ -96,7 +96,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    # Note: This function outputs the dequantized model without merging the adapter yet
    # The code below it will merge the adapter and then save it to disk
    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')
    model = dequantize_model(model, tok, to='output-folder-for-dequantized-model-here')

    print(model)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
  5. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions merge_qlora_with_quantized_model.py
    @@ -110,6 +110,8 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")

    print(f"Successfully saved merged model {model_path} to disk")

    except Exception as e:
    print(f"An error occurred: {e}")
  6. @ChrisHayduk revised this gist Sep 11, 2023. 1 changed file with 9 additions and 0 deletions.
    9 changes: 9 additions & 0 deletions merge_qlora_with_quantized_model.py
    @@ -93,14 +93,23 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl
    )
    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    # Note: This function outputs the dequantized model without merging the adapter yet
    # The code below it will merge the adapter and then save it to disk
    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')

    print(model)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    print(model)
    model = model.merge_and_unload()
    print(model)

    print(f"Successfully loaded the model {model_path} into memory")

    # Note that the output folder here should be different than the one you used for dequantize_model
    # This save will output the model merged with LoRA weights
    model.save_pretrained("put-output-folder-here")
    tokenizer.save_pretrained("put-output-folder-here")

    except Exception as e:
    print(f"An error occurred: {e}")
  7. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -1,7 +1,5 @@
    """
    NOTE:
    The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.
    Thanks for the contributions guys!
  8. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion merge_qlora_with_quantized_model.py
    @@ -2,7 +2,7 @@
    NOTE:
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github.
    The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.
    Thanks for the contributions guys!
  9. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 7 additions and 1 deletion.
    8 changes: 7 additions & 1 deletion merge_qlora_with_quantized_model.py
    @@ -1,5 +1,11 @@
    """
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github. Thanks for the contributions guys!
    NOTE:
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github.
    Thanks for the contributions guys!
    """

    import torch
  10. @ChrisHayduk revised this gist Aug 29, 2023. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion merge_qlora_with_quantized_model.py
    @@ -1,3 +1,7 @@
    """
    The code below combines approaches published both @eugene-yh and @jinyongyoo on Github. Thanks for the contributions guys!
    """

    import torch
    import peft
    import json
    @@ -64,7 +68,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'
    adapter_path = 'Huggingface-adapter/path-goes-here'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
  11. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions merge_qlora_with_quantized_model.py
    @@ -36,8 +36,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfl

    quant_state[2] = dtype

    print(quant_state)

    weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
  12. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 14 additions and 6 deletions.
    20 changes: 14 additions & 6 deletions merge_qlora_with_quantized_model.py
    @@ -9,13 +9,15 @@
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc
    import copy

    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    'to': directory to save the dequantized model
    'dtype': dtype that the model was trained using
    'device': device to load the model to
    """

    # Delete the model object if it exists
    @@ -30,11 +32,17 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    for name, module in model.named_modules():
    if isinstance(module, cls):
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")
    quant_state = copy.deepcopy(module.weight.quant_state)

    quant_state[2] = dtype

    print(quant_state)

    weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None)
    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
    new_module.weight = torch.nn.Parameter(weights)
    new_module.to(device)
    new_module.to(device=device, dtype=dtype)

    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)
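    The quant_state handling added here is the subtle part of this revision: the state is deep-copied so the module's own metadata stays untouched, and slot 2 of the copy, which the gist treats as the target dtype in the list-style quant_state of the bitsandbytes version used here, is overridden so dequantize_4bit emits bfloat16 directly. A compact restatement for a single Linear4bit module (newer bitsandbytes releases expose quant_state as an object, so the index-based override may need adapting there):

      import copy
      import torch
      import bitsandbytes as bnb
      from bitsandbytes.functional import dequantize_4bit

      def dequantize_linear4bit(module: bnb.nn.Linear4bit, dtype=torch.bfloat16, device="cuda"):
          # Copy the quantization metadata and force the dequantized output dtype (slot 2)
          quant_state = copy.deepcopy(module.weight.quant_state)
          quant_state[2] = dtype
          weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)
          # Rebuild a plain nn.Linear carrying the dequantized weights
          new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
          new_module.weight = torch.nn.Parameter(weights)
          return new_module.to(device=device, dtype=dtype)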
  13. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 9 additions and 18 deletions.
    27 changes: 9 additions & 18 deletions merge_qlora_with_quantized_model.py
    @@ -12,7 +12,7 @@

    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    @@ -32,9 +32,9 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=module.bias)
    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None)
    new_module.weight = torch.nn.Parameter(weights)
    new_module.bias = torch.nn.Parameter(module.bias)
    new_module.to(device)

    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)
    @@ -56,11 +56,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo



    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'
    @@ -82,20 +77,16 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    quantization_config=quantization_config,
    device_map={"": 0}
    )

    print(model)
    tok = LlamaTokenizer.from_pretrained(model_path)

    print(f"Successfully loaded the model {model_path} into memory")

    print(f"Dequantizing the model")
    model = dequantize_model(model, tok, to='./dequantized_model')
    print(f"Successfully dequantized the model")

    print(f"Loading and merging the adapter {adapter_path}")
    model = dequantize_model(model, tok, to='/content/drive/MyDrive/QuerySurge AI/dequantized_model')
    print(model)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    print(model)
    model = model.merge_and_unload()
    print(f"Successfully loaded and merged the adapter {adapter_path}")
    print(model)

    print(f"Successfully loaded the model {model_path} into memory")

    except Exception as e:
    print(f"An error occurred: {e}")
  14. @ChrisHayduk revised this gist Aug 25, 2023. No changes.
  15. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 24 additions and 44 deletions.
    68 changes: 24 additions & 44 deletions merge_qlora_with_quantized_model.py
    @@ -10,13 +10,14 @@
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc

    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    """


    # Delete the model object if it exists
    if os.path.exists(to):
    shutil.rmtree(to)
    @@ -25,10 +26,8 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo

    cls = bnb.nn.Linear4bit

    base_model = model.model

    with torch.no_grad():
    for name, module in base_model.named_modules():
    for name, module in model.named_modules():
    if isinstance(module, cls):
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")
    @@ -37,27 +36,34 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    new_module.weight = torch.nn.Parameter(weights)
    new_module.bias = torch.nn.Parameter(module.bias)

    parent, target, target_name = _get_submodules(base_model, name)
    parent, target, target_name = _get_submodules(model, name)
    setattr(parent, target_name, new_module)

    # a hack, setting this to avoid hf's saving error because hf
    # itself does not support saving a model that is registered to be loaded in 4bit.
    base_model.is_loaded_in_4bit = False
    model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    base_model.save_pretrained(to)
    model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    return base_model
    return model



    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    @@ -77,24 +83,19 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = dequantize_model(model, tok)

    print(f"Successfully loaded the model {model_path} into memory")

    # Delete the model object if it exists
    if 'model' in locals():
    del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")
    print(f"Dequantizing the model")
    model = dequantize_model(model, tok, to='./dequantized_model')
    print(f"Successfully dequantized the model")

    print(f"Loading and merging the adapter {adapter_path}")
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()
    print(f"Successfully loaded and merged the adapter {adapter_path}")


    except Exception as e:
    print(f"An error occurred: {e}")
    @@ -109,25 +110,4 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

    #Now reload model into memory
    model_path = './dequantized_model'
    adapter_path = 'Example/Adapter-Path'

    try:
    print(f"Starting to load the model {model_path} with adapter {adapter_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} and adapter {adapter_path} into memory")
    print("Model, GPU cache, and garbage have been cleared.")
  16. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion merge_qlora_with_quantized_model.py
    @@ -116,7 +116,7 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    adapter_path = 'Example/Adapter-Path'

    try:
    print(f"Starting to load the model {model_path} into memory")
    print(f"Starting to load the model {model_path} with adapter {adapter_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
  17. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 34 additions and 4 deletions.
    38 changes: 34 additions & 4 deletions merge_qlora_with_quantized_model.py
    @@ -58,7 +58,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'Example/Adapter-Path'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    @@ -82,10 +81,20 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = dequantize_model(model, tok)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} into memory")

    # Delete the model object if it exists
    if 'model' in locals():
    del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

    except Exception as e:
    print(f"An error occurred: {e}")
    @@ -100,4 +109,25 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")
    print("Model, GPU cache, and garbage have been cleared.")

    #Now reload model into memory
    model_path = './dequantized_model'
    adapter_path = 'Example/Adapter-Path'

    try:
    print(f"Starting to load the model {model_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} and adapter {adapter_path} into memory")
  18. @ChrisHayduk revised this gist Aug 25, 2023. 1 changed file with 11 additions and 14 deletions.
    25 changes: 11 additions & 14 deletions merge_qlora_with_quantized_model.py
    @@ -1,4 +1,14 @@
    import torch
    import peft
    import json
    import shutil
    from peft.utils import _get_submodules
    import os
    import bitsandbytes as bnb
    from bitsandbytes.functional import dequantize_4bit
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    @@ -7,14 +17,6 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo
    """


    import peft
    import json
    import shutil
    from peft.utils import _get_submodules
    import os
    import bitsandbytes as bnb
    from bitsandbytes.functional import dequantize_4bit

    # Delete the model object if it exists
    if os.path.exists(to):
    shutil.rmtree(to)
    @@ -53,15 +55,10 @@ def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.flo

    return base_model


    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'
    adapter_path = 'Example/Adapter-Path'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
  19. @ChrisHayduk renamed this gist Aug 25, 2023. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  20. @ChrisHayduk created this gist Aug 25, 2023.
    106 changes: 106 additions & 0 deletions gistfile1.txt
    @@ -0,0 +1,106 @@
    import torch

    def dequantize_model(model, tokenizer, to='./dequantized_model', dtype=torch.float16):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    """


    import peft
    import json
    import shutil
    from peft.utils import _get_submodules
    import os
    import bitsandbytes as bnb
    from bitsandbytes.functional import dequantize_4bit

    # Delete the model object if it exists
    if os.path.exists(to):
    shutil.rmtree(to)

    os.makedirs(to, exist_ok=True)

    cls = bnb.nn.Linear4bit

    base_model = model.model

    with torch.no_grad():
    for name, module in base_model.named_modules():
    if isinstance(module, cls):
    print(f"Dequantizing `{name}`...")
    weights = dequantize_4bit(module.weight.data, quant_state=module.weight.quant_state, quant_type="nf4")

    new_module = torch.nn.Linear(module.in_features, module.out_features, bias=module.bias)
    new_module.weight = torch.nn.Parameter(weights)
    new_module.bias = torch.nn.Parameter(module.bias)

    parent, target, target_name = _get_submodules(base_model, name)
    setattr(parent, target_name, new_module)

    # a hack, setting this to avoid hf's saving error because hf
    # itself does not support saving a model that is registered to be loaded in 4bit.
    base_model.is_loaded_in_4bit = False

    print("Saving dequantized model...")
    base_model.save_pretrained(to)
    tokenizer.save_pretrained(to)
    config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
    config_data.pop("quantization_config", None)
    config_data.pop("pretraining_tp", None)
    with open(os.path.join(to, 'config.json'), 'w') as config:
    config.write(json.dumps(config_data, indent=2))

    return base_model


    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, CodeLlamaTokenizer
    import gc


    model_path = 'NousResearch/Llama-2-13b-hf'
    adapter_path = 'ChrisHayduk/QuerySurge-AI'

    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    )

    try:
    print(f"Starting to load the model {model_path} into memory")

    model = LlamaForCausalLM.from_pretrained(
    model_path,
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    device_map={"": 0}
    )

    print(model.model)

    tok = LlamaTokenizer.from_pretrained(model_path)
    model = dequantize_model(model, tok)
    model = PeftModel.from_pretrained(model = model, model_id = adapter_path)
    model = model.merge_and_unload()

    print(f"Successfully loaded the model {model_path} into memory")

    except Exception as e:
    print(f"An error occurred: {e}")

    # Delete the model object if it exists
    if 'model' in locals():
    del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")