Skip to content

Instantly share code, notes, and snippets.

@SebastianBodza
Last active December 23, 2024 23:35
Show Gist options
  • Save SebastianBodza/1e32205e8bd76c23e205ec2dc1551fde to your computer and use it in GitHub Desktop.
Save SebastianBodza/1e32205e8bd76c23e205ec2dc1551fde to your computer and use it in GitHub Desktop.
parse ilias multiple choice
import xml.etree.ElementTree as ET
import pandas as pd
import base64
import re
def clean_text(text):
    """Return *text* with markup tags removed and whitespace normalised.

    Every ``<...>`` tag is replaced by a single space, each run of
    whitespace is collapsed to one space, and leading/trailing whitespace
    is trimmed.  ``None`` yields an empty string.
    """
    if text is None:
        return ""
    # Tags become spaces (not empty strings) so adjacent words stay apart.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    return single_spaced.strip()
def decode_base64_field(field_entry):
    """Decode a base64-encoded metadata entry into a UTF-8 string.

    Args:
        field_entry: The base64 text (``str`` or bytes-like) to decode.

    Returns:
        The decoded text, or ``None`` when *field_entry* is not valid
        base64, does not decode to valid UTF-8, or has an unsupported type
        (for example ``None``).
    """
    try:
        return base64.b64decode(field_entry).decode('utf-8')
    except (ValueError, TypeError):
        # ValueError covers binascii.Error (malformed base64 / padding) and
        # UnicodeDecodeError; TypeError covers non-string input such as
        # None.  Anything else (e.g. KeyboardInterrupt) must propagate.
        return None
def parse_question_data(xml_file):
tree = ET.parse(xml_file)
root = tree.getroot()
questions_data = []
for item in root.findall('.//item'):
question_id = item.get('ident', '')
title = item.get('title', '')
# Get metadata fields
metadata = item.find('.//itemmetadata/qtimetadata')
meta_dict = {}
if metadata:
for field in metadata.findall('qtimetadatafield'):
label = field.find('fieldlabel').text
entry = field.find('fieldentry').text
if label in ['options', 'inputs', 'prts', 'extra_info']:
decoded_entry = decode_base64_field(entry)
if decoded_entry:
meta_dict[f'{label}_decoded'] = decoded_entry
meta_dict[label] = entry
# Process options
options_decoded = meta_dict.get('options_decoded', '')
if options_decoded:
vars_match = re.search(r'question_variables";s:\d+:"(.*?)";', options_decoded, re.DOTALL)
question_vars = vars_match.group(1) if vars_match else ""
question_vars = question_vars.replace('\\n', '\n').replace('\\', '')
# Extract additional options settings
settings = {
'multiplication_sign': re.search(r'multiplication_sign";s:\d+:"(.*?)"', options_decoded),
'simplify': re.search(r'question_simplify";i:(\d+)', options_decoded),
'assume_positive': re.search(r'assume_positive";i:(\d+)', options_decoded),
'complex_numbers': re.search(r'complex_numbers";s:\d+:"(.*?)"', options_decoded),
'inverse_trig': re.search(r'inverse_trig";s:\d+:"(.*?)"', options_decoded)
}
settings = {k: v.group(1) if v else None for k, v in settings.items()}
else:
question_vars = ""
settings = {}
# Process inputs
inputs_decoded = meta_dict.get('inputs_decoded', '')
if inputs_decoded:
input_fields = []
for input_match in re.finditer(r'input_name";s:\d+:"(.*?)".*?teacher_answer";s:\d+:"(.*?)"', inputs_decoded, re.DOTALL):
input_fields.append({
'name': input_match.group(1),
'expected_answer': input_match.group(2)
})
else:
input_fields = []
# Process PRTs (feedback and scoring)
prts_decoded = meta_dict.get('prts_decoded', '')
if prts_decoded:
feedback_matches = re.finditer(r'feedback";s:\d+:"(.*?)"', prts_decoded)
feedback_texts = [m.group(1) for m in feedback_matches]
score_matches = re.finditer(r'score";s:\d+:"(.*?)"', prts_decoded)
scores = [m.group(1) for m in score_matches]
else:
feedback_texts = []
scores = []
# Get question text
presentation = item.find('.//presentation')
question_texts = presentation.findall('.//mattext[@texttype="text/xhtml"]')
question_text = question_texts[0].text if question_texts else ""
# Create a prompt suitable for LLM testing
prompt = f"""Given the following mathematical function and its context:
Question: {clean_text(question_text)}
Function variables:
{question_vars}
Your task is to:
1. Explain the mathematical concept being tested
2. Show how to solve this type problem step by step
3. Provide the general formula for the solution
Please format your response in a clear, educational manner."""
questions_data.append({
'question_id': question_id,
'title': title,
'author': meta_dict.get('AUTHOR', ''),
'points': meta_dict.get('POINTS', ''),
'question_type': meta_dict.get('QUESTIONTYPE', ''),
'original_question': question_text,
'question_variables': question_vars,
'prompt': prompt,
'settings': settings,
'input_fields': input_fields,
'feedback_rules': feedback_texts,
'possible_scores': scores,
})
return pd.DataFrame(questions_data)
# Create the dataset
# Parses the hard-coded XML export (presumably an ILIAS/QTI question pool,
# judging by the file name) into one DataFrame row per question.  Runs at
# import time; the resulting ``df`` is reused by the code further down.
df = parse_question_data('1612776863__0__qti_27417.xml')
# Optional debugging aid (disabled): dump selected columns of one row.
# print("\nSample Question Format:")
# print("=" * 80)
# for column in ['question_id', 'title', 'prompt', 'settings', 'input_fields', 'feedback_rules', 'possible_scores']:
#     print(f"\n{column.upper()}:")
#     print(df[column].iloc[-10])
#     print("-" * 40)
# Save to CSV for LLM testing
# (written to the current working directory, without the index column)
df.to_csv('math_questions_for_llm.csv', index=False)
# Bare expression: presumably left over from a notebook cell, where it
# displays the DataFrame; it has no effect when run as a plain script.
df
##################################################
# NON-WORKING MAXIMA
##################################################
import re
import subprocess
import time
import re
import subprocess
import time
class MaximaSession:
    """Wrapper around an interactive ``maxima`` subprocess.

    Commands are written to the child's stdin and its stdout is read line
    by line.  Because the process is started with ``--very-quiet`` (which
    suppresses expression labels, so no ``(%iN)`` prompt line arrives to
    mark the end of a reply), every batch of input is followed by a
    ``print`` of a sentinel string; reading stops when that sentinel line
    comes back.  NOTE(review): the sentinel protocol has not been verified
    against a real Maxima installation -- confirm before relying on it.

    Attributes:
        maxima_executable: Name/path of the Maxima binary to launch.
        process: The running ``subprocess.Popen`` object, or ``None`` once
            the session is closed (or failed to start).
    """

    # Marker Maxima is asked to print after each batch of input.
    _SENTINEL = "__MAXIMA_SESSION_DONE__"

    def __init__(self):
        """Start Maxima immediately; raises if start-up fails."""
        self.maxima_executable = 'maxima'
        self.process = None
        self.initialize_maxima()

    def initialize_maxima(self):
        """Launch the subprocess and send the initial configuration.

        Raises:
            Exception: Whatever start-up error occurred (for example
                ``FileNotFoundError`` when the executable is missing); the
                session is closed before the error is re-raised.
        """
        try:
            # Start maxima process with --very-quiet flag to reduce output noise
            self.process = subprocess.Popen(
                [self.maxima_executable, '--very-quiet'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                # Merge stderr into stdout: a separate stderr pipe that is
                # never read can fill up and block the child.
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1  # Line buffered
            )
            # Initialize with required settings
            init_commands = """
display2d:false$
keepfloat:true$
file_search_lisp:append(["./maxima/*.lisp"],file_search_lisp)$
load("./maxima/stackmaxima.mac")$
"""
            self.process.stdin.write(init_commands)
            self._write_sentinel()
            # Read (and discard) everything printed during initialisation.
            self._read_until_prompt()
            print("Maxima initialized successfully")
        except Exception as e:
            print(f"Initialization error: {e}")
            self.close()
            raise

    def _write_sentinel(self):
        """Ask Maxima to print the sentinel line, then flush stdin."""
        self.process.stdin.write(f'print("{self._SENTINEL}")$\n')
        self.process.stdin.flush()

    def _read_until_prompt(self):
        """Read output lines until the end-of-reply marker.

        Reading stops at the sentinel line, at a line starting with
        ``(%i`` (an input prompt, should labels be enabled), or at end of
        stream.  Blank lines and lines starting with ``(%`` are dropped.

        Returns:
            The collected lines joined with ``"\\n"``.
        """
        output = []
        while True:
            line = self.process.stdout.readline()
            if not line:  # EOF: the child closed its stdout.
                break
            line = line.strip()
            if line == self._SENTINEL or line.startswith('(%i'):
                break
            if line and not line.startswith('(%'):  # Ignore prompts
                output.append(line)
        return '\n'.join(output)

    def send_command(self, command: str) -> "str | None":
        """Send one command and return Maxima's textual reply.

        A ``;`` terminator is appended unless the command already ends in
        ``;`` or ``$``.

        Returns:
            The reply with any ``(%iN)``/``(%oN)`` labels removed, or
            ``None`` if sending/reading failed (the error is printed).
        """
        try:
            stripped = command.strip()
            # Ensure command is terminated; "$" is Maxima's silent terminator.
            if not stripped.endswith((';', '$')):
                command = stripped + ';'
            # Write command, then the sentinel that marks the end of the reply
            self.process.stdin.write(command + '\n')
            self._write_sentinel()
            # Read output until the sentinel
            result = self._read_until_prompt()
            # Clean up the result
            if result:
                # Remove any remaining prompts or labels
                result = re.sub(r'\(%[io][0-9]+\)\s*', '', result)
                result = result.strip()
            return result
        except Exception as e:
            print(f"Command error: {e}")
            return None

    def close(self):
        """Shut the subprocess down and release its pipes.

        Safe to call repeatedly and when the process never started.
        """
        process = self.process
        if process is None:
            return
        try:
            try:
                process.stdin.write("quit();\n")
                process.stdin.flush()
            except (OSError, ValueError):
                # Pipe already broken/closed: the child is gone or going.
                pass
            process.terminate()
            process.wait(timeout=2)
        except subprocess.TimeoutExpired:
            process.kill()  # Force kill if terminate doesn't work
            process.wait()
        finally:
            for stream in (process.stdin, process.stdout):
                if stream is None:
                    continue
                try:
                    stream.close()
                except OSError:
                    pass
            self.process = None
def evaluate_variables(var_dict, maxima):
    """Evaluate interdependent variable definitions with Maxima.

    Each expression is evaluated after the variables it mentions: every
    identifier that names another entry of *var_dict* is replaced by that
    entry's evaluated value (wrapped in parentheses so operator precedence
    is preserved) before the expression is sent to Maxima.

    Args:
        var_dict: Mapping ``variable name -> expression text``.
        maxima: Object with a ``send_command(str)`` method returning the
            evaluated text (or ``None`` on failure).

    Returns:
        Dict ``variable name -> result of send_command``.

    Identifiers that are not keys of *var_dict* (function names, free
    symbols) are left untouched.  A variable that refers to itself, or
    that takes part in a dependency cycle, is evaluated with the
    offending name left symbolic instead of recursing without bound.
    A dependency whose evaluation produced ``None`` or an empty string
    is likewise left as the bare name.
    """
    evaluated = {}
    in_progress = set()  # names currently on the recursion stack
    # An identifier not preceded by a word character or "%", so that
    # "e5" in "1e5" and the "e" in Maxima's "%e" are not treated as names.
    identifier = re.compile(r'(?<![\w%])[a-zA-Z_]\w*')

    def substitute(match):
        value = evaluated.get(match.group(0))
        if not value:
            # Unknown name, still in progress, or no usable value.
            return match.group(0)
        return f'({value})'

    def evaluate_var(var_name):
        # If already evaluated, return the value
        if var_name in evaluated:
            return evaluated[var_name]

        in_progress.add(var_name)
        expression = var_dict[var_name]

        # Evaluate dependencies first (skipping ones already on the stack).
        for dep in identifier.findall(expression):
            if dep in var_dict and dep not in evaluated and dep not in in_progress:
                evaluate_var(dep)

        # One pass with a callable replacement: values are inserted
        # literally and never re-scanned for further substitution.
        expression = identifier.sub(substitute, expression)

        # Evaluate the expression using Maxima
        result = maxima.send_command(expression)
        in_progress.discard(var_name)
        evaluated[var_name] = result
        return result

    # Evaluate all variables
    for var_name in var_dict:
        evaluate_var(var_name)
    return evaluated
# Evaluate the first question's variables with Maxima and substitute the
# results into its "@name@" placeholders.
maxima = MaximaSession()
try:
    clean = clean_text(df["original_question"].iloc[0])
    # Placeholder names that appear in the question text as "@name@".
    variables = re.findall(r'@(.*?)@', clean)
    question_variables = df["question_variables"].iloc[0]

    # Split the variable definitions ("name: value;" or "name := value;").
    var_dict = {}
    pattern = r'([a-zA-Z_][\w\(\)]*)\s*(?:\:=|\:)\s*(.*?);'
    for var_name, var_value in re.findall(pattern, question_variables, re.DOTALL):
        var_dict[var_name] = var_value.strip()

    # The variables could depend on each other
    evaluated_vars = evaluate_variables(var_dict, maxima)
finally:
    # Always stop the subprocess, even when evaluation fails.
    maxima.close()

print(evaluated_vars)

# Two passes, so placeholders introduced by a substituted value are also
# resolved.
for _ in range(2):
    for var in variables:
        value = evaluated_vars.get(var.strip())
        if value is not None:
            # Only the "@name@" placeholder is replaced.  Replacing the
            # bare name would rewrite every substring occurrence of it
            # (a variable called "a" would alter every letter "a").
            clean = clean.replace(f"@{var}@", value)
        else:
            # Unknown or failed variable: leave an empty placeholder.
            clean = clean.replace(f"@{var}@", "@@")
print(clean)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment