reward.py (gist by @ssghost, created June 30, 2025)
https://gist.github.com/ssghost/c9d696dab3fe4eccb624c9a989db3803
import requests
from collections import OrderedDict
import logging
import re
import json
from time import sleep

def count_xml(text: str) -> float:
    """Score format compliance: each correctly placed tag earns +0.125,
    and stray text after the closing </answer> tag costs 0.001 per character."""
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
    return count
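
# Quick hand check (not part of the original gist): a response that follows the
# expected layout exactly, with a trailing newline, earns the full 0.5.
_perfect = "<reasoning>\nstep-by-step thoughts\n</reasoning>\n<answer>\nfinal essay\n</answer>\n"
assert abs(count_xml(_perfect) - 0.5) < 1e-9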

def get_score(text: str) -> float:
    """Query the ZeroGPT detector and return 100 minus its fakePercentage,
    so more human-looking text scores higher (range 0-100)."""
    base_url = "https://api.zerogpt.com/api/detect/detectText"
    headers = OrderedDict((
        ("Host", "api.zerogpt.com"),
        ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:135.0) Gecko/20100101 Firefox/135.0"),
        ("Accept", "application/json, text/plain, */*"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br, zstd"),
        ("Content-Type", "application/json"),
        ("Origin", "https://www.zerogpt.com"),
        ("Connection", "keep-alive"),
        ("Referer", "https://www.zerogpt.com/"),
        ("Sec-Fetch-Dest", "empty"),
        ("Sec-Fetch-Mode", "cors"),
        ("Sec-Fetch-Site", "same-site"),
        ("Priority", "u=0"),
        ("Pragma", "no-cache"),
        ("Cache-Control", "no-cache"),
    ))

    data = {
        "input_text": text,
    }

    r = requests.post(base_url, headers=headers, json=data)
    j = r.json()

    if j["code"] != 200:
        logging.error(f"Failed to get score. Response: {j}")
        sleep(1)
        return 0.0  # Not ideal, but we need to return a float in all cases

    fake_percentage = j["data"]["fakePercentage"]
    return 100.0 - fake_percentage  # Penalize highly fake-looking content
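
# Offline sanity check (hypothetical, not part of the original gist): stub the
# HTTP call so the scoring path can be exercised without hitting ZeroGPT. The
# mocked response mirrors the fields get_score reads above.
from unittest.mock import MagicMock, patch

_fake = MagicMock()
_fake.json.return_value = {"code": 200, "data": {"fakePercentage": 30.0}}
with patch("requests.post", return_value=_fake):
    assert get_score("some essay text") == 70.0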

def extract_answer_content(text: str) -> str | None:
    """Extract just the content between <answer> tags."""
    try:
        answer = text.split("<answer>")[-1]
        answer = answer.split("</answer>")[0]
        return answer.strip()
    except Exception:
        return None

def extract_reasoning_content(text: str) -> str | None:
    """Extract just the content between <reasoning> tags."""
    try:
        reasoning = text.split("<reasoning>")[-1]
        reasoning = reasoning.split("</reasoning>")[0]
        return reasoning.strip()
    except Exception:
        return None

def tiered_reward_func(completions, **kwargs) -> list[float]:
    """
    Tiered reward function for ZeroGPT. Rewards build up in tiers: individual
    tag presence, then format compliance, then the ZeroGPT score itself.
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = []
    strict_pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>$"

    for response in responses:
        reward = 0.0
        response = response.strip()

        # Tier 0: Individual tag rewards
        if response.count("<reasoning>") == 1:
            reward += 0.1
        if response.count("</reasoning>") == 1:
            reward += 0.1
        if response.count("<answer>") == 1:
            reward += 0.1
        if response.count("</answer>") == 1:
            reward += 0.1

        # Tier 1: Format checking
        soft_pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
        if re.search(soft_pattern, response, re.DOTALL):
            reward += 0.25

        xml_score = count_xml(response)
        reward += xml_score

        # re.DOTALL so .*? can span the multi-line reasoning and answer bodies
        strict_match = re.match(strict_pattern, response, re.DOTALL)

        # Tier 2: Only add the ZeroGPT score if the strict format passes
        if strict_match:
            reward += 0.5

            # Extract just the answer content for the detector check
            answer_content = extract_answer_content(response)
            if answer_content:
                reward += get_score(answer_content) / 5.0  # Scale the 0-100 score down to 0-20

            reasoning_content = extract_reasoning_content(response)
            if reasoning_content:
                reasoning_len = len(reasoning_content)
                reasoning_len = min(reasoning_len, 500)
                reward += 0.005 * reasoning_len  # Maxes out at a reward of +2.5

        rewards.append(reward)

    # Print the response with the highest reward
    best_idx = rewards.index(max(rewards))
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")

    return rewards
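
# Hypothetical usage (not part of the original gist): completions arrive in the
# chat-style shape GRPO-style trainers pass to reward functions, a list of
# message lists. Commented out because Tier 2 makes a live API request.
# _demo_completions = [
#     [{"role": "assistant",
#       "content": "<reasoning>\nwhy this angle\n</reasoning>\n<answer>\nthe essay\n</answer>"}],
# ]
# tiered_reward_func(_demo_completions)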


from transformers import pipeline

selected_model = "madhurjindal/autonlp-Gibberish-Detector-492513457"
classifier = pipeline("text-classification", model=selected_model)
# classifier("I love Machine Learning!", top_k=100)
# Returns:
# [{'label': 'clean', 'score': 0.713941216468811}, {'label': 'mild gibberish', 'score': 0.1742146760225296}, {'label': 'word salad', 'score': 0.08730163425207138}, {'label': 'noise', 'score': 0.02454242669045925}]

def get_gibberish_score(text: str) -> float:
    lines = text.split("\n")
    results = []
    for line in lines:
        line = line.strip()
        if line:  # Skip empty lines
            clf_response = classifier(line, top_k=100)
            # Find the 'clean' score
            clean_score = 0.0
            for r in clf_response:
                if r['label'] == 'clean':
                    clean_score = r['score']
                    break
            results.append(clean_score)
    if not results:  # Guard: all-blank input would otherwise crash min()
        return 0.0
    # Return the average clean score
    # return sum(results) / len(results)
    # The model learned to cheat this, so instead return the minimum clean score
    # return min(results)
    # Apply an extra harsh penalty for gibberish by squaring the minimum clean score
    return min(results) ** 2
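
# Rough intuition (hypothetical calls, not part of the original gist): fluent
# lines score near 1.0 for 'clean', so the squared minimum barely dents the
# reward, while a single gibberish line drags the multiplier toward 0.
# get_gibberish_score("This is a clear, well-formed sentence.")  # close to 1.0
# get_gibberish_score("asdf qwer zxcv\nThis line is fine.")      # close to 0.0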

def only_zerogpt_reward_func(completions, **kwargs) -> list[float]:
    """
    Reward function built on ZeroGPT alone: each completion's API score,
    scaled down by its gibberish multiplier.
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = [get_score(response) for response in responses]
    gibberish_multipliers = [get_gibberish_score(response) for response in responses]
    # Scale each reward by its gibberish multiplier
    rewards = [r * g for r, g in zip(rewards, gibberish_multipliers)]

    # Print the response with the highest reward
    best_idx = rewards.index(max(rewards))
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")

    return rewards

SYSTEM_PROMPT = """
You are a human essay writer. You write like a human, not a language model.
"""

def load_dataset(file="prompts.json"):
    with open(file, "r") as f:
        data = json.load(f)

    r = []
    for d in data:
        r.append({
            "prompt": [
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": d
                }
            ]
        })

    return r

dataset = load_dataset()
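
# prompts.json is read as a flat JSON list of prompt strings (inferred from the
# loop above; the file itself is not included in the gist), e.g.:
# ["Write an essay about the history of the bicycle.",
#  "Argue for or against year-round schooling."]

# Hypothetical training hookup (not part of the gist), assuming these reward
# functions feed trl's GRPOTrainer; the model id and config are placeholders:
# from datasets import Dataset
# from trl import GRPOConfig, GRPOTrainer
# trainer = GRPOTrainer(
#     model="Qwen/Qwen2.5-0.5B-Instruct",
#     reward_funcs=[tiered_reward_func],
#     args=GRPOConfig(output_dir="zerogpt-grpo"),
#     train_dataset=Dataset.from_list(dataset),
# )
# trainer.train()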