Skip to content

Instantly share code, notes, and snippets.

@mzbac
Created April 27, 2024 06:51
Show Gist options
  • Save mzbac/c10ba6b8cad89942c8924a27e82a1455 to your computer and use it in GitHub Desktop.
Save mzbac/c10ba6b8cad89942c8924a27e82a1455 to your computer and use it in GitHub Desktop.

Revisions

  1. mzbac created this gist Apr 27, 2024.
    45 changes: 45 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,45 @@
    from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments,BitsAndBytesConfig
    from datasets import load_dataset

    # Checkpoint whose tokenizer (and therefore chat template) is used to
    # render the conversations below.
    model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Training split of the source function-calling dataset.
    dataset = load_dataset(
        "glaiveai/glaive-function-calling-v2",
        split="train",
    )

    def formatting_prompts_func(example):
        """Render a batch of rows as chat-template formatted text.

        Each row's ``system`` string becomes the system message (its first
        ``len("SYSTEM:")`` characters are dropped), and its ``chat`` string is
        split on ``<|endoftext|>`` into segments that are converted to
        user/assistant messages. Function responses are added with the
        ``user`` role. Segments that neither contain ``USER:`` nor start with
        ``FUNCTION RESPONSE:`` are ignored.

        Args:
            example: Batch mapping with parallel ``'system'`` and ``'chat'``
                sequences of strings.

        Returns:
            ``{"text": [...]}`` with one rendered string per input row.

        Relies on the module-level ``tokenizer`` for ``apply_chat_template``.
        """
        user_tag = "USER:"
        assistant_tag = "ASSISTANT:"
        function_tag = "FUNCTION RESPONSE:"
        output_texts = []

        for system_prompt, chat in zip(example['system'], example['chat']):
            messages = [
                {
                    "role": "system",
                    "content": system_prompt[len("SYSTEM:"):].strip(),
                },
            ]
            for segment in chat.split("<|endoftext|>"):
                segment = segment.strip()
                if not segment:
                    continue

                if user_tag in segment:
                    # Split on the FIRST assistant marker only, so a reply
                    # that itself contains "ASSISTANT:" is kept whole instead
                    # of being truncated.
                    user_part, marker, assistant_part = segment.partition(assistant_tag)
                    user_content = user_part.strip()
                    messages.append({"role": "user", "content": user_content[len(user_tag):].strip()})
                    if marker:
                        messages.append({"role": "assistant", "content": assistant_part.strip()})
                elif segment.startswith(function_tag):
                    function_response = segment[len(function_tag):].strip()
                    # partition never yields more than three parts, so extra
                    # "ASSISTANT:" occurrences cannot break the unpacking the
                    # way a bare two-name split() unpack would.
                    function_part, marker, assistant_part = function_response.partition(assistant_tag)
                    messages.append({"role": "user", "content": function_part.strip()})
                    if marker:
                        messages.append({"role": "assistant", "content": assistant_part.strip()})

            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            output_texts.append(text)

        return {"text": output_texts}

    # Render every batch to chat-template text and drop the raw source
    # columns in the same pass.
    dataset = dataset.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=["system", "chat"],
    )

    # Publish the converted dataset to the Hugging Face Hub.
    dataset.push_to_hub("mzbac/glaive-function-calling-v2-llama-3-format")