import json
import re
from threading import Lock

from openai import OpenAI

# Load API keys from a local JSON file and build a single OpenAI client for all requests.
with open('keys.json', 'r') as file:
    api_keys = json.load(file)

client = OpenAI(api_key=api_keys["openai"])
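# Both response files are expected to share this shape (inferred from the fields this
# script reads and writes; the example values are hypothetical):
#
#   {
#       "responses": [
#           {"model": "gpt-4", "seed": 0, "question": "...", "response": "..."},
#           ...
#       ]
#   }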
def load_responses(file_path):
    """Return the list stored under "responses" in the given JSON file, or [] if the file is missing."""
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data["responses"]
    except FileNotFoundError:
        return []
# Serializes writes to evaluated_model_responses.json.
eval_responses_lock = Lock()


def save_evaluations(evaluated_responses):
    with eval_responses_lock:
        with open('evaluated_model_responses.json', 'w') as file:
            json.dump({"responses": evaluated_responses}, file, indent=4)
# Previously evaluated responses, keyed by model/seed/question/response so they are not re-evaluated.
evaluated_responses = load_responses('evaluated_model_responses.json')
evaluated_questions = {
    f"{resp['model']}_{resp['seed']}_{resp['question']}_{resp['response']}": resp
    for resp in evaluated_responses
}

# Raw model responses that still need a quality score.
raw_responses = load_responses('model_responses.json')
prompt_template = """
Evaluate the quality of the given response on a scale from 1 to 10. A rating of 1 indicates poor quality, while a 10 signifies an exceptional answer that not only addresses the question with accuracy and clarity but also demonstrates extraordinary depth and insight across the following dimensions:
- Comprehensive Explanation: Accurately and thoroughly explains the topic, covering all relevant aspects.
- Technical Proficiency: Includes optimized code (where applicable) that demonstrates practical application.
- Scientific and Mathematical Insight: Delivers a detailed scientific or mathematical analysis... suggests further areas for research or exploration.
- Natural and Philosophical Depth: Explores the topic's significance in natural phenomena and its philosophical implications... reflects on the interconnectedness of the topic with broader universal principles.
- Aesthetic and Practical Applications: Discusses aesthetic considerations and real-world applications... illustrates how the concept has been applied in art, design, or everyday solutions.
- Visualization and Examples: Provides visual aids or examples to illustrate key points, enhancing comprehension and engagement.
- Exhaustiveness: The user often wants a thorough explanation or solution. A highly rated answer (i.e. 9 or 10) should only be given if the answer is exhaustive. That means, in the context of code or longer explanations, NOT skipping over things, NOT inserting placeholder content, but actually giving the user the entirety of what they asked for.
For an exceptional (10) rating, a response must transcend mere factual accuracy and weave these elements into a cohesive, enlightening narrative that invites further exploration and reflection. A rating of 10 is exceedingly rare and should only be awarded if you believe the response is the best possible answer humanity has ever produced for the given question and that it could not be improved further, even by a group of PhDs in that topic area.
Here are some example evaluations:
- Question: What is the capital of France?
Response: The capital of France is Paris.
Rating: 4 because the response is accurate and directly answers the question, but offers no depth beyond the bare fact.
- Question: What is the capital of France?
Response: The capital of France is Paris. Established as a city over 2,000 years ago, Paris has played a crucial role in global history, culture, and arts. It is not only the political and economic heart of France but also a leading center for art, fashion, gastronomy, and culture. Its historical significance is evident in landmarks like the Eiffel Tower, the Notre-Dame Cathedral, and the Louvre Museum, the latter being the most visited art museum in the world.
Rating: 7 because the response is accurate, directly answers the question, and adds historical and cultural detail, though it stops short of exceptional depth.
- Question: Explain photosynthesis.
Response: It's how plants make their food.
Rating: 3 because, while correct, the explanation lacks detail.
- Question: Explain photosynthesis.
Response: Heya Bobbity X: .. Sooo It's how plants make their food.
Rating: 3 because, while correct, the explanation lacks detail and the response has nonsensical text in it that is likely artifacts of an LLM output.
- Question: What's the distance to the moon?
Response: Very far away.
Rating: 1 because the response is too vague and not informative.
- Question: What's the distance to the moon?
Response: Really close... innit.
Rating: 1 because the response is flippant and contains no useful information.
- Question: Explain the concept of the Fibonacci sequence, its calculation methods, and its significance across different fields, including its natural, philosophical, and aesthetic implications. Provide an optimized Python code for calculating the sequence and include a visualization of the first 10 Fibonacci numbers to illustrate its growth pattern.
Response: The Fibonacci sequence is a series of numbers where each subsequent number is the sum of the two preceding ones, usually starting with 0 and 1. This sequence is not merely a set of numbers but a principle found throughout the natural world, philosophy, art, and science, reflecting a universal law of growth and harmony.
# Scientific and Mathematical Analysis: [...]
# Natural Phenomena: [...]
# Philosophical and Aesthetic Implications: [...]
Philosophically, [...] In art and architecture, the Fibonacci sequence and the related Golden Ratio have been used to create compositions [......]
# Further Exploration with Python and Visualization:[....]
```python
# Python code and visualization as previously detailed
```
# Broader Implications:
The sequence's omnipresence in nature and its applications in human endeavors point to [..........]
This exploration of the Fibonacci sequence, from its mathematical calculation to its manifestations in nature [.....]
Rating: 10 because this response transcends a simple explanation or technical demonstration. It provides a deep dive into the Fibonacci sequence's mathematical basis, practical computation, visual representation, and extends into its philosophical depth and universal significance. Such a comprehensive analysis not only educates but also inspires further exploration and contemplation across a wide range of disciplines, exemplifying the highest standard of response quality.
Please evaluate the following response:
Question: {question}
Response: {response}
Rating:
"""
def evaluate_response(question, response):
    """Ask GPT-4 to rate a single response and return the integer score (or an error marker)."""
    prompt = prompt_template.format(question=question, response=response)
    evaluation = client.chat.completions.create(
        model="gpt-4",
        messages=[{
            "role": "user",
            "content": prompt
        }],
        max_tokens=60,
        temperature=0.0
    )
    # The prompt asks for "Rating: <n> because ...", so take the first integer in the reply.
    match = re.search(r'(\d+)', evaluation.choices[0].message.content)
    if match:
        quality_score = int(match.group(1))
    else:
        quality_score = "Error parsing score"
    return quality_score
# Evaluate each raw response and attach a quality score, skipping ones already evaluated.
for response in raw_responses:
    identifier = f"{response['model']}_{response['seed']}_{response['question']}_{response['response']}"
    if identifier not in evaluated_questions:
        print(f"Evaluating [seed:{response['seed']}] {response['question']} for model: {response['model']}")
        score = evaluate_response(response["question"], response["response"])
        response["quality_score"] = score
        with eval_responses_lock:
            evaluated_responses.append(response)
        # save_evaluations() takes the lock itself, so call it outside the with-block;
        # this checkpoints progress after every evaluation.
        save_evaluations(evaluated_responses)
    else:
        print(f"Skipping evaluation for [seed:{response['seed']}] {response['question']} for model: {response['model']} - already evaluated.")
# Final write of the complete evaluated set.
save_evaluations(evaluated_responses)

print("Evaluation complete and saved to evaluated_model_responses.json")