Created
June 30, 2025 08:52
-
-
Save ssghost/c9d696dab3fe4eccb624c9a989db3803 to your computer and use it in GitHub Desktop.
Revisions
-
ssghost created this gist
Jun 30, 2025. There are no files selected for viewing.
import requests
from collections import OrderedDict
import logging
import re
import json
from time import sleep


def count_xml(text) -> float:
    """Score how well *text* matches the expected XML scaffold.

    Awards 0.125 per correctly-placed tag (max 0.5) and applies a small
    per-character penalty for trailing content after </answer>.
    """
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        # -1 excludes the trailing newline itself from the penalty.
        count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
    return count


def get_score(text: str) -> float:
    """Query the ZeroGPT API and return a "human-ness" score in [0, 100].

    Returns ``100 - fakePercentage`` (higher = more human-like).
    On API failure, logs the response, sleeps briefly, and returns 0.0 —
    deliberately best-effort so a training run is not aborted by a flaky API.
    """
    base_url = "https://api.zerogpt.com/api/detect/detectText"
    # OrderedDict: the exact header order mimics a real browser request.
    headers = OrderedDict((
        ("Host", "api.zerogpt.com"),
        ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:135.0) Gecko/20100101 Firefox/135.0"),
        ("Accept", "application/json, text/plain, */*"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br, zstd"),
        ("Content-Type", "application/json"),
        ("Origin", "https://www.zerogpt.com"),
        ("Connection", "keep-alive"),
        ("Referer", "https://www.zerogpt.com/"),
        ("Sec-Fetch-Dest", "empty"),
        ("Sec-Fetch-Mode", "cors"),
        ("Sec-Fetch-Site", "same-site"),
        ("Priority", "u=0"),
        ("Pragma", "no-cache"),
        ("Cache-Control", "no-cache")
    ))
    data = {
        "input_text": text,
    }
    r = requests.post(base_url, headers=headers, json=data)
    j = r.json()
    if j['code'] != 200:
        logging.error(f"Failed to get score. \nResponse: {j}")
        sleep(1)
        return 0.0  # Not ideal, but we need to return a float in all cases
    fake_percentage = j["data"]["fakePercentage"]
    return 100.0 - fake_percentage  # Penalize for highly fake content


def extract_answer_content(text: str) -> str | None:
    """Extract just the content between <answer> tags."""
    try:
        answer = text.split("<answer>")[-1]
        answer = answer.split("</answer>")[0]
        return answer.strip()
    except (AttributeError, TypeError):
        # Narrowed from a bare except: only non-string input can fail here.
        return None


def extract_reasoning_content(text: str) -> str | None:
    """Extract just the content between <reasoning> tags."""
    try:
        reasoning = text.split("<reasoning>")[-1]
        reasoning = reasoning.split("</reasoning>")[0]
        return reasoning.strip()
    except (AttributeError, TypeError):
        # Narrowed from a bare except: only non-string input can fail here.
        return None


def tiered_reward_func(completions, **kwargs) -> list[float]:
    """
    Tiered reward function for ZeroGPT.
    Rewards are broken down into multiple tiers:
      Tier 0: +0.1 per individual tag present exactly once.
      Tier 1: +0.25 for a soft format match, plus count_xml() score.
      Tier 2: strict format match unlocks +0.5, the ZeroGPT score on the
              answer (0-20), and a reasoning-length bonus (max +2.5).
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = []
    strict_pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>$"
    for response in responses:
        reward = 0.0
        response = response.strip()
        # Tier 0: Individual tag rewards
        if response.count("<reasoning>") == 1:
            reward += 0.1
        if response.count("</reasoning>") == 1:
            reward += 0.1
        if response.count("<answer>") == 1:
            reward += 0.1
        if response.count("</answer>") == 1:
            reward += 0.1
        # Tier 1: Format checking
        soft_pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
        if re.search(soft_pattern, response, re.DOTALL):
            reward += 0.25
        xml_score = count_xml(response)
        reward += xml_score
        strict_match = re.match(strict_pattern, response)
        # Tier 2: Only add ZeroGPT score if strict format passes
        if strict_match:
            reward += 0.5
            # Extract just the answer content for check
            answer_content = extract_answer_content(response)
            if answer_content:
                # get_score() is in [0, 100]; dividing by 5 gives a 0-20 range
                # (comment fixed — the original said "1-20").
                reward += get_score(answer_content) / 5.0
            reasoning_content = extract_reasoning_content(response)
            if reasoning_content:
                reasoning_len = len(reasoning_content)
                reasoning_len = min(reasoning_len, 500)
                reward += 0.005 * reasoning_len  # Max out at reward of +2.5
        rewards.append(reward)
    # Print the response with the highest reward
    best_idx = rewards.index(max(rewards))
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards


from transformers import pipeline

selected_model = "madhurjindal/autonlp-Gibberish-Detector-492513457"
classifier = pipeline("text-classification", model=selected_model)
# Example: classifier("I love Machine Learning!", top_k=100) returns:
# [{'label': 'clean', 'score': 0.713941216468811},
#  {'label': 'mild gibberish', 'score': 0.1742146760225296},
#  {'label': 'word salad', 'score': 0.08730163425207138},
#  {'label': 'noise', 'score': 0.02454242669045925}]


def get_gibberish_score(text: str) -> float:
    """Return a gibberish multiplier in [0, 1] for *text*.

    Classifies each non-empty line and returns the squared MINIMUM 'clean'
    score across lines — the model learned to cheat the average, and the
    square makes the penalty extra harsh.
    """
    lines = text.split("\n")
    results = []
    for line in lines:
        line = line.strip()
        if line:  # Skip empty lines
            clf_response = classifier(line, top_k=100)
            # Find the 'clean' score
            clean_score = 0.0
            for r in clf_response:
                if r['label'] == 'clean':
                    clean_score = r['score']
                    break
            results.append(clean_score)
    # Bug fix: min() on an empty list raises ValueError when the text is
    # empty or all-whitespace; treat that as maximally gibberish.
    if not results:
        return 0.0
    # Averaging (and plain min) were cheated by the model; return the
    # square of the minimum clean score as an extra-harsh penalty.
    return min(results) ** 2


def only_zerogpt_reward_func(completions, **kwargs) -> list[float]:
    """
    Reward function for ZeroGPT.
    Only uses ZeroGPT API to score completions, scaled by the
    gibberish multiplier so nonsense text cannot score well.
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = [get_score(response) for response in responses]
    gibberish_multipliers = [get_gibberish_score(response) for response in responses]
    # Multiply the rewards by the gibberish multipliers
    rewards = [r * g for r, g in zip(rewards, gibberish_multipliers)]
    # Print the response with the highest reward
    best_idx = rewards.index(max(rewards))
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards


SYSTEM_PROMPT = """
You are a human essay writer. You write like a human, not a language model.
"""


def load_dataset(file="prompts.json"):
    """Load prompts from *file* (a JSON list of strings) and wrap each in a
    chat-format record with the system prompt prepended."""
    with open(file, "r") as f:
        data = json.load(f)
    r = []
    for d in data:
        r.append({
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": d}
            ]
        })
    return r


dataset = load_dataset()