Skip to content

Instantly share code, notes, and snippets.

@tikendraw
Created September 21, 2024 06:44
Show Gist options
  • Save tikendraw/fdffe9fa2bc33e32afe23a69224078d7 to your computer and use it in GitHub Desktop.
Save tikendraw/fdffe9fa2bc33e32afe23a69224078d7 to your computer and use it in GitHub Desktop.

Revisions

  1. tikendraw created this gist Sep 21, 2024.
    80 changes: 80 additions & 0 deletions parse json code blocks for text blobs.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,80 @@

    import re
    import json
    import ast
    from pydantic import ValidationError

    def extract_code_block(text):
        """Extract dict-like literals from fenced code blocks in *text*.

        Fences of the form ```json ... ``` or ``` ... ``` whose content is
        wrapped in braces are located with a regex, and each one is evaluated
        with ``ast.literal_eval`` so Python-style literals (single quotes,
        ``True``/``None``) are accepted.

        NOTE: this file defines ``extract_code_block`` a second time further
        down (a ``json.loads`` variant); that later definition rebinds the
        name, so this variant is only reachable if the later one is removed
        or renamed.

        Args:
            text: Text that may contain fenced code blocks.

        Returns:
            A list with one evaluated literal per block that parsed, or
            ``None`` when no block was found or none could be evaluated.
        """
        code_blocks = re.findall(r'```(?:json)?\s*({.*?})\s*```', text, re.DOTALL)

        parsed = []
        for block in code_blocks:
            try:
                parsed.append(ast.literal_eval(block))
            except (SyntaxError, ValueError, TypeError):
                # Skip only the malformed block; TypeError covers literals
                # such as a dict with an unhashable key.
                continue

        # An empty result is reported as None, same as "no block found".
        return parsed or None



    # Function to extract potential JSON/dict blocks
    def extract_code_block(text):
        """Extract JSON objects from fenced code blocks in *text*.

        Fences of the form ```json ... ``` or ``` ... ``` whose content is
        wrapped in braces are located with a regex and each one is parsed with
        ``json.loads``. Blocks that are not valid JSON are skipped so that one
        malformed block does not discard the valid ones.

        Args:
            text: Text that may contain fenced code blocks.

        Returns:
            A list with one parsed object per block that decoded, or ``None``
            when no block was found or none of them was valid JSON.
        """
        code_blocks = re.findall(r'```(?:json)?\s*({.*?})\s*```', text, re.DOTALL)

        parsed = []
        for block in code_blocks:
            try:
                parsed.append(json.loads(block))
            except json.JSONDecodeError:
                # Ignore just this block; the caller can still fall back
                # when nothing at all could be parsed.
                continue

        return parsed or None

    # Fallback function when parsing fails
    def fallback_extract(text, expected_keys):
        """Recover values for *expected_keys* from loosely JSON-like text.

        For each key, the first ``"key": <value>`` occurrence in *text* is
        located. A value that starts with a quote or an opening brace is
        decoded with the JSON decoder when it is well-formed, so strings and
        objects containing spaces or commas are kept whole. Anything else
        falls back to the raw token that runs up to the next whitespace or
        comma. The resulting value then goes through a small type inference:
        integer-looking text becomes ``int`` and ``{...}`` text is parsed as
        JSON when possible.

        Args:
            text: Raw text to scan.
            expected_keys: Iterable of key names to look for.

        Returns:
            A dict with one entry per expected key; keys that are not found
            in *text* map to ``None``.
        """
        decoder = json.JSONDecoder()
        fallback_dict = {}
        for key in expected_keys:
            # Escape the key so regex metacharacters in it match literally.
            pattern = rf'"{re.escape(str(key))}"\s*:\s*([^\s,]+)'
            match = re.search(pattern, text)
            if match is None:
                fallback_dict[key] = None
                continue
            raw_value = _read_raw_value(text, match, decoder)
            fallback_dict[key] = _infer_value(raw_value)
        return fallback_dict


    def _read_raw_value(text, match, decoder):
        """Return the value that starts at group 1 of *match*, decoded if possible."""
        token = match.group(1)
        if token[0] in '"{':
            try:
                # Decode a complete JSON string/object starting at the value,
                # ignoring whatever follows it in the text.
                value, _ = decoder.raw_decode(text, match.start(1))
            except json.JSONDecodeError:
                pass  # Malformed value: use the token heuristic below.
            else:
                return value

        if token[0] not in '{[':
            # Drop closing brackets of an enclosing container, e.g. `30}`.
            token = token.rstrip('}]')
        return token.strip('"')


    def _infer_value(value):
        """Convert integer-like or object-like text; leave other values untouched."""
        if not isinstance(value, str):
            return value  # Already decoded into a JSON object.
        if re.fullmatch(r'-?\d+', value):
            return int(value)
        if re.match(r'^\{.*\}$', value):  # Detect dictionary structure
            try:
                return json.loads(value)
            except json.JSONDecodeError:
                return value  # Leave it as a string if malformed
        return value

    # Main function to handle parsing with fallback
    def parse_with_fallback(text, pydantic_class):
        """Parse *text* into *pydantic_class* instances, with a regex fallback.

        Fenced JSON blocks are tried first via ``extract_code_block``. If no
        block is found, or validating the blocks raises ``ValidationError``,
        the field names of *pydantic_class* are looked up directly in the text
        with ``fallback_extract`` and a single instance is built from them.

        Args:
            text: Raw text expected to contain JSON-like data.
            pydantic_class: Pydantic model class to validate the data against.

        Returns:
            A list of model instances when the fenced blocks validate, a
            single model instance when the fallback data validates, or an
            error message string when the fallback data fails validation.
        """
        # Pydantic v2 exposes field definitions as `model_fields` and
        # deprecates `__fields__`; v1 models only provide `__fields__`.
        fields = getattr(pydantic_class, "model_fields", None)
        if fields is None:
            fields = pydantic_class.__fields__
        expected_keys = list(fields)

        # First try to extract clean JSON blocks
        parsed_blocks = extract_code_block(text)

        if parsed_blocks:
            try:
                return [pydantic_class(**block) for block in parsed_blocks]
            except ValidationError as e:
                # Report the problem, then continue with the fallback below.
                print("Validation error:", e)

        # Fallback to manually extracting key-value pairs
        fallback_data = fallback_extract(text, expected_keys)

        try:
            return pydantic_class(**fallback_data)
        except ValidationError as e:
            return f"Error parsing with fallback: {e}"