parse ILIAS multiple choice
import xml.etree.ElementTree as ET
import pandas as pd
import base64
import re
def clean_text(text):
    # Strip HTML tags and collapse whitespace.
    if text is None:
        return ""
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def decode_base64_field(field_entry):
    # Decode a base64-encoded metadata entry; return None if it cannot be decoded.
    try:
        return base64.b64decode(field_entry).decode('utf-8')
    except Exception:
        return None
def parse_question_data(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    questions_data = []

    for item in root.findall('.//item'):
        question_id = item.get('ident', '')
        title = item.get('title', '')

        # Get metadata fields
        metadata = item.find('.//itemmetadata/qtimetadata')
        meta_dict = {}
        if metadata is not None:  # truth-testing an Element depends on its child count, so compare to None
            for field in metadata.findall('qtimetadatafield'):
                label = field.find('fieldlabel').text
                entry = field.find('fieldentry').text
                if label in ['options', 'inputs', 'prts', 'extra_info']:
                    decoded_entry = decode_base64_field(entry)
                    if decoded_entry:
                        meta_dict[f'{label}_decoded'] = decoded_entry
                meta_dict[label] = entry

        # Process options
        options_decoded = meta_dict.get('options_decoded', '')
        if options_decoded:
            vars_match = re.search(r'question_variables";s:\d+:"(.*?)";', options_decoded, re.DOTALL)
            question_vars = vars_match.group(1) if vars_match else ""
            question_vars = question_vars.replace('\\n', '\n').replace('\\', '')
            # Extract additional options settings
            settings = {
                'multiplication_sign': re.search(r'multiplication_sign";s:\d+:"(.*?)"', options_decoded),
                'simplify': re.search(r'question_simplify";i:(\d+)', options_decoded),
                'assume_positive': re.search(r'assume_positive";i:(\d+)', options_decoded),
                'complex_numbers': re.search(r'complex_numbers";s:\d+:"(.*?)"', options_decoded),
                'inverse_trig': re.search(r'inverse_trig";s:\d+:"(.*?)"', options_decoded)
            }
            settings = {k: v.group(1) if v else None for k, v in settings.items()}
        else:
            question_vars = ""
            settings = {}

        # Process inputs
        inputs_decoded = meta_dict.get('inputs_decoded', '')
        if inputs_decoded:
            input_fields = []
            for input_match in re.finditer(r'input_name";s:\d+:"(.*?)".*?teacher_answer";s:\d+:"(.*?)"', inputs_decoded, re.DOTALL):
                input_fields.append({
                    'name': input_match.group(1),
                    'expected_answer': input_match.group(2)
                })
        else:
            input_fields = []

        # Process PRTs (feedback and scoring)
        prts_decoded = meta_dict.get('prts_decoded', '')
        if prts_decoded:
            feedback_matches = re.finditer(r'feedback";s:\d+:"(.*?)"', prts_decoded)
            feedback_texts = [m.group(1) for m in feedback_matches]
            score_matches = re.finditer(r'score";s:\d+:"(.*?)"', prts_decoded)
            scores = [m.group(1) for m in score_matches]
        else:
            feedback_texts = []
            scores = []

        # Get question text
        presentation = item.find('.//presentation')
        question_texts = presentation.findall('.//mattext[@texttype="text/xhtml"]') if presentation is not None else []
        question_text = question_texts[0].text if question_texts else ""

        # Create a prompt suitable for LLM testing
        prompt = f"""Given the following mathematical function and its context:
Question: {clean_text(question_text)}
Function variables:
{question_vars}
Your task is to:
1. Explain the mathematical concept being tested
2. Show how to solve this type of problem step by step
3. Provide the general formula for the solution
Please format your response in a clear, educational manner."""

        questions_data.append({
            'question_id': question_id,
            'title': title,
            'author': meta_dict.get('AUTHOR', ''),
            'points': meta_dict.get('POINTS', ''),
            'question_type': meta_dict.get('QUESTIONTYPE', ''),
            'original_question': question_text,
            'question_variables': question_vars,
            'prompt': prompt,
            'settings': settings,
            'input_fields': input_fields,
            'feedback_rules': feedback_texts,
            'possible_scores': scores,
        })

    return pd.DataFrame(questions_data)
# Create the dataset
df = parse_question_data('1612776863__0__qti_27417.xml')

# print("\nSample Question Format:")
# print("=" * 80)
# for column in ['question_id', 'title', 'prompt', 'settings', 'input_fields', 'feedback_rules', 'possible_scores']:
#     print(f"\n{column.upper()}:")
#     print(df[column].iloc[-10])
#     print("-" * 40)

# Save to CSV for LLM testing
df.to_csv('math_questions_for_llm.csv', index=False)
df
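# Hypothetical sanity check (not in the original gist): reload the CSV written
# above and look at the first generated prompt. Column names are the ones used
# in parse_question_data.
check_df = pd.read_csv('math_questions_for_llm.csv')
if not check_df.empty:
    print(check_df['prompt'].iloc[0])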
| ################################################## | |
| NON WORKING MAXIMA | |
| ################################################## | |
import re
import subprocess
import time
class MaximaSession:
    def __init__(self):
        self.maxima_executable = 'maxima'
        self.process = None
        self.initialize_maxima()

    def initialize_maxima(self):
        try:
            # Start maxima process with --very-quiet flag to reduce output noise
            self.process = subprocess.Popen(
                [self.maxima_executable, '--very-quiet'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                bufsize=1  # Line buffered
            )
            # Initialize with required settings
            init_commands = """
display2d:false$
keepfloat:true$
file_search_lisp:append(["./maxima/*.lisp"],file_search_lisp)$
load("./maxima/stackmaxima.mac")$
"""
            self.process.stdin.write(init_commands)
            self.process.stdin.flush()
            # Read initial output
            self._read_until_prompt()
            print("Maxima initialized successfully")
        except Exception as e:
            print(f"Initialization error: {e}")
            self.close()
            raise e

    def _read_until_prompt(self):
        """Read output until we find the input prompt."""
        output = []
        while True:
            line = self.process.stdout.readline()
            if not line:
                break
            line = line.strip()
            if line.startswith('(%i'):  # Input prompt
                return '\n'.join(output)
            if line and not line.startswith('(%'):  # Ignore other prompts/labels
                output.append(line)
        return '\n'.join(output)

    def send_command(self, command: str) -> str:
        try:
            # Ensure command ends with semicolon
            if not command.strip().endswith(';'):
                command = command.strip() + ';'
            # Write command and flush
            self.process.stdin.write(command + '\n')
            self.process.stdin.flush()
            # Read output until next prompt
            result = self._read_until_prompt()
            # Clean up the result
            if result:
                # Remove any remaining prompts or labels
                result = re.sub(r'\(%[io][0-9]+\)\s*', '', result)
                result = result.strip()
            return result
        except Exception as e:
            print(f"Command error: {e}")
            return None

    def close(self):
        if self.process:
            try:
                self.process.stdin.write("quit();\n")
                self.process.stdin.flush()
                self.process.terminate()
                self.process.wait(timeout=2)
            except Exception:
                self.process.kill()  # Force kill if terminate doesn't work
            finally:
                self.process = None
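# Hypothetical smoke test (not in the original gist; requires a Maxima binary on PATH):
#   session = MaximaSession()
#   print(session.send_command("diff(x^3, x)"))  # expect 3*x^2
#   session.close()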
def evaluate_variables(var_dict, maxima):
    """
    Recursively evaluate variables that might depend on each other using Maxima
    """
    evaluated = {}

    def evaluate_var(var_name):
        # If already evaluated, return the value
        if var_name in evaluated:
            return evaluated[var_name]
        # Get the expression to evaluate
        expression = var_dict[var_name]
        # Find any variables in the expression that need to be evaluated first
        deps = re.findall(r'([a-zA-Z_][\w]*)', expression)
        # Evaluate dependencies first, then substitute their values into the expression
        for dep in deps:
            if dep in var_dict:
                if dep not in evaluated:
                    evaluate_var(dep)
                if evaluated[dep] is not None:
                    expression = re.sub(r'\b' + dep + r'\b', evaluated[dep], expression)
        # Evaluate the expression using Maxima
        result = maxima.send_command(expression)
        evaluated[var_name] = result
        return result

    # Evaluate all variables
    for var_name in var_dict:
        evaluate_var(var_name)
    return evaluated
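# Hypothetical toy check (not in the original gist; requires a local Maxima install):
# dependent variables should resolve in dependency order, e.g.
#   evaluate_variables({'a': '3*4', 'b': 'a + 2'}, MaximaSession())
# would be expected to return {'a': '12', 'b': '14'}.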
maxima = MaximaSession()
clean = clean_text(df["original_question"].iloc[0])
variables = re.findall(r'@(.*?)@', clean)
question_variables = df["question_variables"].iloc[0]

var_dict = {}
pattern = r'([a-zA-Z_][\w\(\)]*)\s*(?:\:=|\:)\s*(.*?);'
for match in re.findall(pattern, question_variables, re.DOTALL):
    var_name, var_value = match
    var_value = var_value.strip()
    var_dict[var_name] = var_value

# The variables could depend on each other
evaluated_vars = evaluate_variables(var_dict, maxima)
# try:
#     var_dict[var_name] = maxima.send_command(var_value)
# except Exception as e:
#     print(f"Error evaluating variable {var_name}: {e}")
#     var_dict[var_name] = var_value
print(evaluated_vars)
# Substitute the evaluated values back into the question text (two passes, since
# some values may themselves contain placeholders)
for n in range(2):
    for var in variables:
        if var in evaluated_vars and evaluated_vars[var] is not None:
            clean = clean.replace(f"@{var}@", evaluated_vars[var])
            clean = clean.replace(f"{var}", evaluated_vars[var])
        else:
            clean = clean.replace(f"@{var}@", "@@")
print(clean)
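# Not in the original gist: shut down the Maxima subprocess when finished,
# so no orphaned process is left behind.
maxima.close()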