"""A very rough JSON parser.

Implements the grammar outlined in https://www.json.org/json-en.html.
Number conversion is delegated to Python's ``int`` and ``float``.

The parser is intentionally lenient in a few places: a trailing comma is
accepted in arrays and objects, raw control characters are accepted inside
strings, and numbers may carry leading zeros.

Usage:

>>> data, _ = parse_json('{"key": ["value", -1e20, true, false, null]}')
>>> assert data == {"key": ["value", -1e20, True, False, None]}
"""
import re
from typing import Any, Tuple, Union

# Runs of insignificant JSON whitespace.  The pattern is un-anchored so that it
# can be applied at an arbitrary offset through ``Pattern.match(data, pos)``
# (a leading ``^`` would only ever match at index 0).
whitespace_re = re.compile(r"[ \t\n\r]+")
float_re = re.compile(r"-?\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?")
# Maps each two-character escape sequence to the character it denotes.
single_char_escape = {
    "\\\\": "\\",
    "\\/": "/",
    '\\"': '"',
    "\\b": "\b",
    "\\f": "\f",
    "\\n": "\n",
    "\\r": "\r",
    "\\t": "\t",
}
# Longest run of characters that needs no special handling inside a string.
plain_str_content_re = re.compile(r'[^\\"]+')
# Exactly four hex digits, as required after ``\u``.
_hex4_re = re.compile(r"[0-9a-fA-F]{4}")
_LITERALS = (("true", True), ("false", False), ("null", None))
_NUMBER_START = "-0123456789"


class JSONParseError(ValueError, AssertionError):
    """Raised when the input is not well-formed JSON.

    It derives from ``ValueError`` (what ``parse_json`` has always raised for
    an unexpected character) and from ``AssertionError`` (what the structural
    checks used to raise), so existing ``except`` clauses keep working.
    """


def _char_at(data: str, pos: int, context: str) -> str:
    """Return ``data[pos]`` or raise if the input ends while parsing *context*."""
    if pos >= len(data):
        raise JSONParseError(f"Unexpected end of input while parsing {context}")
    return data[pos]


def _skip_whitespace(data: str, pos: int) -> int:
    """Return the index of the first non-whitespace character at or after *pos*."""
    match = whitespace_re.match(data, pos)
    return match.end() if match else pos


def _scan_hex4(data: str, pos: int) -> int:
    """Decode the four hex digits at ``data[pos:pos + 4]`` of a ``\\u`` escape."""
    match = _hex4_re.match(data, pos)
    if not match:
        raise JSONParseError(f"Invalid \\u escape at position {pos - 2}")
    return int(match[0], 16)


def _scan_string(data: str, pos: int) -> Tuple[str, int]:
    """Parse the string literal opening at *pos*; return (value, end index)."""
    if _char_at(data, pos, "string") != '"':
        raise JSONParseError(f"Expected '\"' at position {pos}")
    parts = []
    pos += 1
    while True:
        char = _char_at(data, pos, "string")
        if char == '"':
            return "".join(parts), pos + 1
        if char != "\\":
            # Cannot fail: the current character is neither '"' nor '\'.
            match = plain_str_content_re.match(data, pos)
            parts.append(match[0])
            pos = match.end()
        elif data.startswith("u", pos + 1):
            code = _scan_hex4(data, pos + 2)
            pos += 6
            # Code points beyond the BMP are written as a UTF-16 surrogate
            # pair of two consecutive \u escapes; merge them into one char.
            if 0xD800 <= code <= 0xDBFF and data.startswith("\\u", pos):
                low_match = _hex4_re.match(data, pos + 2)
                if low_match:
                    low = int(low_match[0], 16)
                    if 0xDC00 <= low <= 0xDFFF:
                        code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                        pos += 6
            parts.append(chr(code))
        else:
            escape = data[pos:pos + 2]
            if escape not in single_char_escape:
                # Without this check an unknown escape never advanced the
                # cursor and the parser looped forever.
                raise JSONParseError(
                    f"Invalid escape sequence {escape!r} at position {pos}"
                )
            parts.append(single_char_escape[escape])
            pos += 2


def _scan_number(data: str, pos: int) -> Tuple[Union[float, int], int]:
    """Parse the number starting at *pos*; return (value, end index)."""
    match = float_re.match(data, pos)
    if not match:
        raise JSONParseError(f"Invalid number at position {pos}")
    text = match[0]
    # A fraction or an exponent makes it a float; everything else is an int.
    if "." in text or "e" in text or "E" in text:
        return float(text), match.end()
    return int(text), match.end()


def _scan_object(data: str, pos: int) -> Tuple[dict, int]:
    """Parse the object opening at *pos*; return (dict, end index)."""
    if _char_at(data, pos, "object") != "{":
        raise JSONParseError(f"Expected '{{' at position {pos}")
    result = {}
    pos += 1
    while True:
        pos = _skip_whitespace(data, pos)
        # Reached for an empty object and, leniently, after a trailing comma.
        if _char_at(data, pos, "object") == "}":
            return result, pos + 1
        key, pos = _scan_string(data, pos)
        pos = _skip_whitespace(data, pos)
        if _char_at(data, pos, "object") != ":":
            raise JSONParseError(f"Expected ':' at position {pos}")
        value, pos = _scan_value(data, pos + 1)
        result[key] = value
        pos = _skip_whitespace(data, pos)
        char = _char_at(data, pos, "object")
        if char == "}":
            return result, pos + 1
        if char != ",":
            raise JSONParseError(f"Expected ',' or '}}' at position {pos}")
        pos += 1


def _scan_array(data: str, pos: int) -> Tuple[list, int]:
    """Parse the array opening at *pos*; return (list, end index)."""
    if _char_at(data, pos, "array") != "[":
        raise JSONParseError(f"Expected '[' at position {pos}")
    items = []
    pos += 1
    while True:
        pos = _skip_whitespace(data, pos)
        # Reached for an empty array and, leniently, after a trailing comma.
        if _char_at(data, pos, "array") == "]":
            return items, pos + 1
        value, pos = _scan_value(data, pos)
        items.append(value)
        pos = _skip_whitespace(data, pos)
        char = _char_at(data, pos, "array")
        if char == "]":
            return items, pos + 1
        if char != ",":
            raise JSONParseError(f"Expected ',' or ']' at position {pos}")
        pos += 1


def _scan_value(data: str, pos: int) -> Tuple[Any, int]:
    """Parse any JSON value at or after *pos*; return (value, end index)."""
    pos = _skip_whitespace(data, pos)
    char = _char_at(data, pos, "value")
    if char == "{":
        return _scan_object(data, pos)
    if char == "[":
        return _scan_array(data, pos)
    if char == '"':
        return _scan_string(data, pos)
    for literal, value in _LITERALS:
        if data.startswith(literal, pos):
            return value, pos + len(literal)
    # JSON numbers never start with '+', so it is rejected below.
    if char in _NUMBER_START:
        return _scan_number(data, pos)
    raise JSONParseError(f"Unexpected character: {data[pos:]}")


def leading_whitespaces(data: str) -> int:
    """Return the number of whitespace characters at the start of *data*."""
    return _skip_whitespace(data, 0)


def parse_string(data: str) -> Tuple[str, int]:
    """Parse a JSON string literal at the start of *data*.

    Returns:
        The decoded string and the number of characters consumed, including
        both quotes.

    Raises:
        JSONParseError: if *data* does not start with '"', the string is not
            terminated, or it contains an invalid escape sequence.
    """
    return _scan_string(data, 0)


def parse_number(s: str) -> Tuple[Union[float, int], int]:
    """Parse a JSON number at the start of *s*.

    Returns:
        The value (``int`` unless a fraction or exponent is present) and the
        number of characters consumed.

    Raises:
        JSONParseError: if *s* does not start with a number.
    """
    return _scan_number(s, 0)


# Original (misspelled) public name, kept so existing callers keep working.
pares_number = parse_number


def parse_object(data: str) -> Tuple[dict, int]:
    """Parse a JSON object at the start of *data*.

    Returns:
        The resulting dict and the number of characters consumed.

    Raises:
        JSONParseError: if *data* does not start with '{' or the object is
            malformed or truncated.
    """
    return _scan_object(data, 0)


def parse_array(data: str) -> Tuple[list, int]:
    """Parse a JSON array at the start of *data*.

    Returns:
        The resulting list and the number of characters consumed.

    Raises:
        JSONParseError: if *data* does not start with '[' or the array is
            malformed or truncated.
    """
    return _scan_array(data, 0)


def parse_json(data: str) -> Tuple[Any, int]:
    """Parse the first JSON value found in *data*.

    Leading whitespace is skipped; anything after the value is left alone.

    Returns:
        The parsed value and the number of characters consumed.  Empty or
        whitespace-only input yields ``(None, len(data))``.

    Raises:
        JSONParseError: if the input is malformed or truncated.
    """
    start = _skip_whitespace(data, 0)
    if start == len(data):
        return None, start
    return _scan_value(data, start)