parse ILIAS multiple choice
import xml.etree.ElementTree as ET
import pandas as pd
import base64
import re
def clean_text(text):
    # Strip HTML tags and collapse whitespace.
    if text is None:
        return ""
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def decode_base64_field(field_entry):
    # Decode a base64-encoded metadata entry; return None if it cannot be decoded.
    try:
        return base64.b64decode(field_entry).decode('utf-8')
    except Exception:
        return None
def parse_question_data(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    questions_data = []

    for item in root.findall('.//item'):
        question_id = item.get('ident', '')
        title = item.get('title', '')

        # Get metadata fields
        metadata = item.find('.//itemmetadata/qtimetadata')
        meta_dict = {}
        if metadata is not None:  # truth-testing an Element depends on its child count, so compare to None
            for field in metadata.findall('qtimetadatafield'):
                label = field.find('fieldlabel').text
                entry = field.find('fieldentry').text
                if label in ['options', 'inputs', 'prts', 'extra_info']:
                    decoded_entry = decode_base64_field(entry)
                    if decoded_entry:
                        meta_dict[f'{label}_decoded'] = decoded_entry
                meta_dict[label] = entry

        # Process options
        options_decoded = meta_dict.get('options_decoded', '')
        if options_decoded:
            vars_match = re.search(r'question_variables";s:\d+:"(.*?)";', options_decoded, re.DOTALL)
            question_vars = vars_match.group(1) if vars_match else ""
            question_vars = question_vars.replace('\\n', '\n').replace('\\', '')
            # Extract additional options settings
            settings = {
                'multiplication_sign': re.search(r'multiplication_sign";s:\d+:"(.*?)"', options_decoded),
                'simplify': re.search(r'question_simplify";i:(\d+)', options_decoded),
                'assume_positive': re.search(r'assume_positive";i:(\d+)', options_decoded),
                'complex_numbers': re.search(r'complex_numbers";s:\d+:"(.*?)"', options_decoded),
                'inverse_trig': re.search(r'inverse_trig";s:\d+:"(.*?)"', options_decoded)
            }
            settings = {k: v.group(1) if v else None for k, v in settings.items()}
        else:
            question_vars = ""
            settings = {}

        # Process inputs
        inputs_decoded = meta_dict.get('inputs_decoded', '')
        if inputs_decoded:
            input_fields = []
            for input_match in re.finditer(r'input_name";s:\d+:"(.*?)".*?teacher_answer";s:\d+:"(.*?)"', inputs_decoded, re.DOTALL):
                input_fields.append({
                    'name': input_match.group(1),
                    'expected_answer': input_match.group(2)
                })
        else:
            input_fields = []

        # Process PRTs (feedback and scoring)
        prts_decoded = meta_dict.get('prts_decoded', '')
        if prts_decoded:
            feedback_matches = re.finditer(r'feedback";s:\d+:"(.*?)"', prts_decoded)
            feedback_texts = [m.group(1) for m in feedback_matches]
            score_matches = re.finditer(r'score";s:\d+:"(.*?)"', prts_decoded)
            scores = [m.group(1) for m in score_matches]
        else:
            feedback_texts = []
            scores = []

        # Get question text
        presentation = item.find('.//presentation')
        question_texts = presentation.findall('.//mattext[@texttype="text/xhtml"]') if presentation is not None else []
        question_text = question_texts[0].text if question_texts else ""

        # Create a prompt suitable for LLM testing
        prompt = f"""Given the following mathematical function and its context:
Question: {clean_text(question_text)}
Function variables:
{question_vars}
Your task is to:
1. Explain the mathematical concept being tested
2. Show how to solve this type of problem step by step
3. Provide the general formula for the solution
Please format your response in a clear, educational manner."""

        questions_data.append({
            'question_id': question_id,
            'title': title,
            'author': meta_dict.get('AUTHOR', ''),
            'points': meta_dict.get('POINTS', ''),
            'question_type': meta_dict.get('QUESTIONTYPE', ''),
            'original_question': question_text,
            'question_variables': question_vars,
            'prompt': prompt,
            'settings': settings,
            'input_fields': input_fields,
            'feedback_rules': feedback_texts,
            'possible_scores': scores,
        })

    return pd.DataFrame(questions_data)
# Create the dataset
df = parse_question_data('1612776863__0__qti_27417.xml')

# print("\nSample Question Format:")
# print("=" * 80)
# for column in ['question_id', 'title', 'prompt', 'settings', 'input_fields', 'feedback_rules', 'possible_scores']:
#     print(f"\n{column.upper()}:")
#     print(df[column].iloc[-10])
#     print("-" * 40)

# Save to CSV for LLM testing
df.to_csv('math_questions_for_llm.csv', index=False)
df
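# Hypothetical sanity check (not in the original gist): reload the CSV written
# above and look at the first generated prompt. Column names are the ones used
# in parse_question_data.
check_df = pd.read_csv('math_questions_for_llm.csv')
if not check_df.empty:
    print(check_df['prompt'].iloc[0])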
| ################################################## | |
| NON WORKING MAXIMA | |
| ################################################## | |
import re
import subprocess
import time
class MaximaSession:
    def __init__(self):
        self.maxima_executable = 'maxima'
        self.process = None
        self.initialize_maxima()

    def initialize_maxima(self):
        try:
            # Start maxima process with --very-quiet flag to reduce output noise
            self.process = subprocess.Popen(
                [self.maxima_executable, '--very-quiet'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                bufsize=1  # Line buffered
            )
            # Initialize with required settings
            init_commands = """
display2d:false$
keepfloat:true$
file_search_lisp:append(["./maxima/*.lisp"],file_search_lisp)$
load("./maxima/stackmaxima.mac")$
"""
            self.process.stdin.write(init_commands)
            self.process.stdin.flush()
            # Read initial output
            self._read_until_prompt()
            print("Maxima initialized successfully")
        except Exception as e:
            print(f"Initialization error: {e}")
            self.close()
            raise e

    def _read_until_prompt(self):
        """Read output until we find the input prompt."""
        output = []
        while True:
            line = self.process.stdout.readline()
            if not line:
                break
            line = line.strip()
            if line.startswith('(%i'):  # Input prompt
                return '\n'.join(output)
            if line and not line.startswith('(%'):  # Ignore other prompts/labels
                output.append(line)
        return '\n'.join(output)

    def send_command(self, command: str) -> str:
        try:
            # Ensure command ends with semicolon
            if not command.strip().endswith(';'):
                command = command.strip() + ';'
            # Write command and flush
            self.process.stdin.write(command + '\n')
            self.process.stdin.flush()
            # Read output until next prompt
            result = self._read_until_prompt()
            # Clean up the result
            if result:
                # Remove any remaining prompts or labels
                result = re.sub(r'\(%[io][0-9]+\)\s*', '', result)
                result = result.strip()
            return result
        except Exception as e:
            print(f"Command error: {e}")
            return None

    def close(self):
        if self.process:
            try:
                self.process.stdin.write("quit();\n")
                self.process.stdin.flush()
                self.process.terminate()
                self.process.wait(timeout=2)
            except Exception:
                self.process.kill()  # Force kill if terminate doesn't work
            finally:
                self.process = None
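# Hypothetical smoke test (not in the original gist; requires a Maxima binary on PATH):
#   session = MaximaSession()
#   print(session.send_command("diff(x^3, x)"))  # expect 3*x^2
#   session.close()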
def evaluate_variables(var_dict, maxima):
    """
    Recursively evaluate variables that might depend on each other using Maxima
    """
    evaluated = {}

    def evaluate_var(var_name):
        # If already evaluated, return the value
        if var_name in evaluated:
            return evaluated[var_name]
        # Get the expression to evaluate
        expression = var_dict[var_name]
        # Find any variables in the expression that need to be evaluated first
        deps = re.findall(r'([a-zA-Z_][\w]*)', expression)
        # Evaluate dependencies first, then substitute their values into the expression
        for dep in deps:
            if dep in var_dict:
                if dep not in evaluated:
                    evaluate_var(dep)
                if evaluated[dep] is not None:
                    expression = re.sub(r'\b' + dep + r'\b', evaluated[dep], expression)
        # Evaluate the expression using Maxima
        result = maxima.send_command(expression)
        evaluated[var_name] = result
        return result

    # Evaluate all variables
    for var_name in var_dict:
        evaluate_var(var_name)
    return evaluated
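# Hypothetical toy check (not in the original gist; requires a local Maxima install):
# dependent variables should resolve in dependency order, e.g.
#   evaluate_variables({'a': '3*4', 'b': 'a + 2'}, MaximaSession())
# would be expected to return {'a': '12', 'b': '14'}.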
maxima = MaximaSession()
clean = clean_text(df["original_question"].iloc[0])
variables = re.findall(r'@(.*?)@', clean)
question_variables = df["question_variables"].iloc[0]

var_dict = {}
pattern = r'([a-zA-Z_][\w\(\)]*)\s*(?:\:=|\:)\s*(.*?);'
for match in re.findall(pattern, question_variables, re.DOTALL):
    var_name, var_value = match
    var_value = var_value.strip()
    var_dict[var_name] = var_value

# The variables could depend on each other
evaluated_vars = evaluate_variables(var_dict, maxima)
# try:
#     var_dict[var_name] = maxima.send_command(var_value)
# except Exception as e:
#     print(f"Error evaluating variable {var_name}: {e}")
#     var_dict[var_name] = var_value
print(evaluated_vars)
# Substitute the evaluated values back into the question text (two passes, since
# some values may themselves contain placeholders)
for n in range(2):
    for var in variables:
        if var in evaluated_vars and evaluated_vars[var] is not None:
            clean = clean.replace(f"@{var}@", evaluated_vars[var])
            clean = clean.replace(f"{var}", evaluated_vars[var])
        else:
            clean = clean.replace(f"@{var}@", "@@")
print(clean)
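# Not in the original gist: shut down the Maxima subprocess when finished,
# so no orphaned process is left behind.
maxima.close()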