
@essevan
Forked from CraftsMan-Labs/IDP.md
Created March 8, 2025 14:06

Revisions

  1. @CraftsMan-Labs revised this gist Mar 7, 2025. 1 changed file with 74 additions and 96 deletions.
    170 changes: 74 additions & 96 deletions IDP.md
    @@ -1,5 +1,3 @@
    Here's a Python script that combines both approaches, Mistral OCR and vision-capable LLM extraction:

    ```python
    import os
    import base64
    @@ -25,31 +23,31 @@ def process_document_with_mistral_ocr(
    Process a document using Mistral OCR API and return the results.
    Args:
    file_path (str): Path to the document file to process
    file_path (str): Path to the document file to process.
    api_key (str, optional): Mistral API key. Defaults to MISTRAL_API_KEY environment variable.
    model (str, optional): Mistral OCR model to use. Defaults to "mistral-ocr-latest".
    output_format (str, optional): Output format - "markdown", "json", or "html". Defaults to "markdown".
    save_to_file (str, optional): Path to save the output. If None, returns the result.
    Returns:
    The OCR results in the specified format
    The OCR results in the specified format.
    """
    # Import here to make it optional
    from mistralai import Mistral

    # Get API key from environment if not provided
    if not api_key:
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
    raise ValueError("No API key provided and MISTRAL_API_KEY environment variable not set.")

    # Initialize Mistral client
    client = Mistral(api_key=api_key)

    # Process the file
    pdf_file = Path(file_path)
    print(f"Uploading file {pdf_file.name}...")

    # Upload the file
    uploaded_file = client.files.upload(
    file={
    @@ -58,69 +56,70 @@ def process_document_with_mistral_ocr(
    },
    purpose="ocr",
    )

    # Get signed URL for the uploaded file
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    print(f"Processing with OCR model: {model}...")

    # Process the document with OCR
    ocr_response = client.ocr.process(
    document={"type": "document_url", "document_url": signed_url.url},
    model=model,
    include_image_base64=True,
    )

    # Process the response based on the requested output format
    if output_format == "json":
    result = json.loads(ocr_response.model_dump_json())
    else:
    # Get markdown content
    result = ocr_response.pages.markdown

    # Convert to HTML if requested
    if output_format == "html":
    import markdown
    result = markdown.markdown(result, extensions=['tables', 'fenced_code'])
    result = f"""
    OCR Result
    body {{ font-family: Arial, sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 20px; }}
    img {{ max-width: 100%; }}
    pre {{ background-color: #f5f5f5; padding: 10px; overflow: auto; }}
    table {{ border-collapse: collapse; width: 100%; }}
    th, td {{ border: 1px solid #ddd; padding: 8px; }}
    th {{ background-color: #f2f2f2; }}
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="UTF-8">
    <style>
    body {{ font-family: Arial, sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 20px; }}
    img {{ max-width: 100%; }}
    pre {{ background-color: #f5f5f5; padding: 10px; overflow: auto; }}
    table {{ border-collapse: collapse; width: 100%; }}
    th, td {{ border: 1px solid #ddd; padding: 8px; }}
    th {{ background-color: #f2f2f2; }}
    </style>
    </head>
    <body>
    {result}
    </body>
    </html>
    """

    # Save to file if requested
    if save_to_file:
    with open(save_to_file, 'w', encoding='utf-8') as f:
    f.write(result)
    print(f"Results saved to {save_to_file}")
    return None

    return result

    # ===== VISION LLM EXTRACTION IMPLEMENTATION =====

    def encode_image_to_base64(image_path):
    def encode_image_to_base64(image_path: str) -> str:
    """Convert an image file to base64 encoding."""
    with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')

    def extract_json_from_text(text):
    def extract_json_from_text(text: str) -> Optional[str]:
    """Extract JSON object from text that might contain additional content."""
    # Try to find JSON in markdown code blocks
    json_match = re.search(r'``````', text)
    # Try to find JSON in markdown code blocks (using backticks)
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', text)
    if json_match:
    json_str = json_match.group(1)
    else:
    @@ -130,27 +129,24 @@ def extract_json_from_text(text):
    json_str = json_match.group(1)
    else:
    json_str = text

    # Clean the string and try to parse it

    json_str = json_str.strip()
    try:
    json.loads(json_str)
    return json_str
    except json.JSONDecodeError:
    return None

    def generate_extraction_prompt(model_class):
    def generate_extraction_prompt(model_class: Type[T]) -> str:
    """Generate a prompt based on a Pydantic model structure."""
    schema = model_class.model_json_schema()

    prompt = "Analyze this image and extract information in JSON format.\n\n"
    prompt += f"Return the data according to this JSON schema:\n``````\n\n"
    prompt += f"Return the data according to this JSON schema:\n```json\n{json.dumps(schema, indent=2)}\n```\n\n"
    prompt += "Important guidelines:\n"
    prompt += "1. Only return valid JSON that conforms to the schema\n"
    prompt += "2. If you're not sure about a field, use null instead of guessing\n"
    prompt += "3. Don't add any explanations outside the JSON structure\n"
    prompt += "4. Extract as much relevant information as possible\n"

    prompt += "1. Only return valid JSON that conforms to the schema.\n"
    prompt += "2. If you're not sure about a field, use null instead of guessing.\n"
    prompt += "3. Don't add any explanations outside the JSON structure.\n"
    prompt += "4. Extract as much relevant information as possible.\n"
    return prompt

    def extract_structured_data_from_image(
    @@ -165,33 +161,33 @@ def extract_structured_data_from_image(
    Extract structured data from an image based on a Pydantic model.
    Args:
    image_path: Path to the image file
    model_class: Pydantic model class defining the structure to extract
    model_name: Name of the vision model to use
    api_base: API endpoint for the model
    custom_prompt: Optional custom prompt to use instead of the generated one
    max_retries: Maximum number of retry attempts if parsing fails
    image_path (str): Path to the image file.
    model_class (Type[T]): Pydantic model class defining the structure to extract.
    model_name (str): Name of the vision model to use.
    api_base (str): API endpoint for the model.
    custom_prompt (Optional[str]): Optional custom prompt to use. If provided, this overrides the generated prompt.
    max_retries (int): Maximum number of retry attempts if parsing fails.
    Returns:
    An instance of the provided model_class with extracted data
    An instance of the provided model_class with extracted data.
    """
    # Import here to make it optional
    import litellm

    # Get base64 string of the image

    try:
    base64_image = encode_image_to_base64(image_path)
    except Exception as e:
    print(f"Error encoding image: {e}")
    return model_class()
    # Generate or use the provided prompt

    # Use a custom prompt if provided; otherwise, use generated prompt
    prompt = custom_prompt if custom_prompt else generate_extraction_prompt(model_class)

    # Try extraction with retries
    # For a simplified use case (like the additional code snippet), you might want to override
    # the prompt with a fixed instruction. Uncomment the following line to use a fixed prompt:
    # prompt = "Extract all the relevant data from the image in JSON format"

    for attempt in range(max_retries + 1):
    try:
    # Make the API call to the model
    response = litellm.completion(
    model=model_name,
    messages=[
    @@ -214,18 +210,15 @@ def extract_structured_data_from_image(
    api_base=api_base
    )

    # Extract the text response
    response_text = response.choices[0].message.content

    # Extract JSON from the response
    json_str = extract_json_from_text(response_text)

    if json_str:
    # Try to parse the extracted JSON
    structured_data = model_class.model_validate_json(json_str)
    return structured_data
    else:
    if attempt == max_retries:
    print("Failed to extract JSON from response.")
    return model_class()
    except Exception as e:
    print(f"Error in attempt {attempt + 1}: {e}")
    @@ -314,7 +307,7 @@ class ReceiptData(BaseModel):

    if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Extract data from documents and images")
    parser.add_argument("file_path", help="Path to the document or image file to process")
    parser.add_argument("--method", choices=["ocr", "vision"], default="ocr",
    @@ -326,9 +319,10 @@ if __name__ == "__main__":
    parser.add_argument("--output", "-o", help="Path to save the output")
    parser.add_argument("--schema", choices=["image", "document", "product", "receipt"], default="document",
    help="Schema to use for vision method")

    parser.add_argument("--custom_prompt", help="Optional custom prompt for vision extraction", default=None)

    args = parser.parse_args()

    # Process based on method
    if args.method == "ocr":
    # Use Mistral OCR
    @@ -339,59 +333,43 @@ if __name__ == "__main__":
    output_format=args.format,
    save_to_file=args.output
    )

    if result and not args.output:
    print(result)
    else:
    # Use vision LLM extraction
    # Select model class based on schema
    model_classes = {
    "image": ImageData,
    "document": DocumentData,
    "product": ProductData,
    "receipt": ReceiptData
    }
    model_class = model_classes[args.schema]

    # Extract data

    result = extract_structured_data_from_image(
    args.file_path,
    model_class,
    model_name=args.model or "ollama/llava-phi3"
    model_name=args.model or "ollama/llava-phi3",
    custom_prompt=args.custom_prompt,
    )

    # Save or print result

    if args.output:
    save_structured_data(result, args.output)
    else:
    print(result.model_dump_json(indent=2))
    ```

    This script combines both approaches:

    1. Mistral OCR for document processing with high accuracy
    2. Vision-capable LLM extraction for structured data from images
    ### Key Updates
    - **Custom vs. Generated Prompts:**
    You can now override the generated prompt by passing a `--custom_prompt` parameter. (The code also includes a commented line that you can uncomment if you prefer a fixed prompt.)

    You can use it from the command line with various options:
    - **Consistent Base64 Encoding:**
    The `encode_image_to_base64` function is shared between the helper and extraction functions.

    ```bash
    # Use Mistral OCR to process a PDF and output markdown
    python document_extraction.py document.pdf --method ocr --format markdown --output result.md
    - **Unified Vision Extraction:**
    The vision extraction function uses the same `litellm.completion` call as in your provided snippet, ensuring the image is sent in a proper base64 format.

    # Use vision LLM to extract structured data from an image using the receipt schema
    python document_extraction.py receipt.jpg --method vision --schema receipt --output receipt_data.json
    ```

    Or import the functions in your own code:

    ```python
    from document_extraction import process_document_with_mistral_ocr, extract_structured_data_from_image, ReceiptData

    # Process a document with Mistral OCR
    ocr_result = process_document_with_mistral_ocr("document.pdf", output_format="markdown")

    # Extract structured data from a receipt image
    receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
    ```
    - **Command Line Flexibility:**
    Run the script from the command line using the provided examples, choosing between OCR and vision extraction modes.

    The script provides flexibility to choose the appropriate method based on your needs and the type of document you're processing.
    This updated script now brings together both methods with enhanced flexibility for your image and document processing needs.
  2. @CraftsMan-Labs revised this gist Mar 7, 2025. 1 changed file with 196 additions and 56 deletions.
    252 changes: 196 additions & 56 deletions IDP.md
    @@ -1,19 +1,117 @@
    # Extracting Structured Data from Images Using Pydantic and LLMs

    This guide demonstrates how to create a system that uses vision-capable LLMs to extract structured data from images using Pydantic for data validation.

    ## Core Components
    Here's a Python script that combines both approaches, Mistral OCR and vision-capable LLM extraction:

    ```python
    import litellm
    import os
    import base64
    import json
    import re
    from typing import List, Dict, Any, Optional, Union, Type, TypeVar
    from pydantic import BaseModel, Field
    from pathlib import Path

    # Type variable for Pydantic models
    T = TypeVar('T', bound=BaseModel)

    # ===== MISTRAL OCR IMPLEMENTATION =====

    def process_document_with_mistral_ocr(
    file_path,
    api_key=None,
    model="mistral-ocr-latest",
    output_format="markdown",
    save_to_file=None
    ):
    """
    Process a document using Mistral OCR API and return the results.
    Args:
    file_path (str): Path to the document file to process
    api_key (str, optional): Mistral API key. Defaults to MISTRAL_API_KEY environment variable.
    model (str, optional): Mistral OCR model to use. Defaults to "mistral-ocr-latest".
    output_format (str, optional): Output format - "markdown", "json", or "html". Defaults to "markdown".
    save_to_file (str, optional): Path to save the output. If None, returns the result.
    Returns:
    The OCR results in the specified format
    """
    # Import here to make it optional
    from mistralai import Mistral

    # Get API key from environment if not provided
    if not api_key:
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
    raise ValueError("No API key provided and MISTRAL_API_KEY environment variable not set.")

    # Initialize Mistral client
    client = Mistral(api_key=api_key)

    # Process the file
    pdf_file = Path(file_path)
    print(f"Uploading file {pdf_file.name}...")

    # Upload the file
    uploaded_file = client.files.upload(
    file={
    "file_name": pdf_file.stem,
    "content": pdf_file.read_bytes(),
    },
    purpose="ocr",
    )

    # Get signed URL for the uploaded file
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    print(f"Processing with OCR model: {model}...")

    # Process the document with OCR
    ocr_response = client.ocr.process(
    document={"type": "document_url", "document_url": signed_url.url},
    model=model,
    include_image_base64=True,
    )

    # Process the response based on the requested output format
    if output_format == "json":
    result = json.loads(ocr_response.model_dump_json())
    else:
    # Get markdown content
    result = ocr_response.pages.markdown

    # Convert to HTML if requested
    if output_format == "html":
    import markdown
    result = markdown.markdown(result, extensions=['tables', 'fenced_code'])
    result = f"""
    OCR Result
    body {{ font-family: Arial, sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 20px; }}
    img {{ max-width: 100%; }}
    pre {{ background-color: #f5f5f5; padding: 10px; overflow: auto; }}
    table {{ border-collapse: collapse; width: 100%; }}
    th, td {{ border: 1px solid #ddd; padding: 8px; }}
    th {{ background-color: #f2f2f2; }}
    {result}
    """

    # Save to file if requested
    if save_to_file:
    with open(save_to_file, 'w', encoding='utf-8') as f:
    f.write(result)
    print(f"Results saved to {save_to_file}")
    return None

    return result

    # ===== VISION LLM EXTRACTION IMPLEMENTATION =====

    def encode_image_to_base64(image_path):
    """Convert an image file to base64 encoding."""
    with open(image_path, "rb") as image_file:
    @@ -77,6 +175,9 @@ def extract_structured_data_from_image(
    Returns:
    An instance of the provided model_class with extracted data
    """
    # Import here to make it optional
    import litellm

    # Get base64 string of the image
    try:
    base64_image = encode_image_to_base64(image_path)
    @@ -132,11 +233,9 @@ def extract_structured_data_from_image(
    return model_class()

    return model_class()
    ```

    ## Utility Functions
    # ===== UTILITY FUNCTIONS =====

    ```python
    def save_structured_data(data: BaseModel, output_path: str):
    """Save structured data to a JSON file."""
    with open(output_path, "w") as f:
    @@ -148,11 +247,9 @@ def read_structured_data(file_path: str, model_class: Type[T]) -> T:
    with open(file_path, "r") as f:
    json_data = f.read()
    return model_class.model_validate_json(json_data)
    ```

    ## Pydantic Models
    # ===== PYDANTIC MODELS =====

    ```python
    class Person(BaseModel):
    """Data model for a person detected in an image."""
    name: Optional[str] = None
    @@ -203,55 +300,98 @@ class ProductData(BaseModel):
    features: List[str] = Field(default_factory=list)
    condition: Optional[str] = None
    estimated_price_range: Optional[str] = None
    ```

    ## Example Usage
    class ReceiptData(BaseModel):
    """Model for extracting information from receipts."""
    store_name: Optional[str] = None
    date: Optional[str] = None
    total_amount: Optional[str] = None
    items: List[Dict[str, Any]] = Field(default_factory=list)
    payment_method: Optional[str] = None
    tax_amount: Optional[str] = None

    # ===== EXAMPLE USAGE =====

    ```python
    # Example usage
    if __name__ == "__main__":
    # Define your image path
    image_path = "path/to/your/image.jpg"

    # Extract general image data
    image_data = extract_structured_data_from_image(image_path, ImageData)

    # Print the extracted data
    print(image_data.model_dump_json(indent=2))
    import argparse

    # Save the extracted data
    save_structured_data(image_data, "extracted_image_data.json")
    parser = argparse.ArgumentParser(description="Extract data from documents and images")
    parser.add_argument("file_path", help="Path to the document or image file to process")
    parser.add_argument("--method", choices=["ocr", "vision"], default="ocr",
    help="Method to use: 'ocr' for Mistral OCR or 'vision' for vision LLM extraction")
    parser.add_argument("--api-key", help="API key (defaults to environment variable)")
    parser.add_argument("--model", help="Model to use (defaults based on method)")
    parser.add_argument("--format", choices=["markdown", "json", "html"], default="markdown",
    help="Output format for OCR method")
    parser.add_argument("--output", "-o", help="Path to save the output")
    parser.add_argument("--schema", choices=["image", "document", "product", "receipt"], default="document",
    help="Schema to use for vision method")

    # Extract data with a different model
    document_data = extract_structured_data_from_image(image_path, DocumentData)
    print(document_data.model_dump_json(indent=2))
    args = parser.parse_args()

    # Custom prompt example
    custom_prompt = """
    Examine this image carefully and extract the following information:
    1. Identify all people present
    2. Describe their clothing in detail
    3. List all visible objects
    4. Note any text visible in the image
    Format the response as valid JSON according to the provided schema.
    """
    # Process based on method
    if args.method == "ocr":
    # Use Mistral OCR
    result = process_document_with_mistral_ocr(
    args.file_path,
    api_key=args.api_key,
    model=args.model or "mistral-ocr-latest",
    output_format=args.format,
    save_to_file=args.output
    )

    if result and not args.output:
    print(result)
    else:
    # Use vision LLM extraction
    # Select model class based on schema
    model_classes = {
    "image": ImageData,
    "document": DocumentData,
    "product": ProductData,
    "receipt": ReceiptData
    }
    model_class = model_classes[args.schema]

    # Extract data
    result = extract_structured_data_from_image(
    args.file_path,
    model_class,
    model_name=args.model or "ollama/llava-phi3"
    )

    # Save or print result
    if args.output:
    save_structured_data(result, args.output)
    else:
    print(result.model_dump_json(indent=2))
    ```

    data = extract_structured_data_from_image(
    image_path,
    ImageData,
    custom_prompt=custom_prompt
    )

    # Custom model example
    class ReceiptData(BaseModel):
    """Model for extracting information from receipts."""
    store_name: Optional[str] = None
    date: Optional[str] = None
    total_amount: Optional[str] = None
    items: List[Dict[str, Any]] = Field(default_factory=list)
    payment_method: Optional[str] = None
    tax_amount: Optional[str] = None

    receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
    This script combines both approaches:

    1. Mistral OCR for document processing with high accuracy
    2. Vision-capable LLM extraction for structured data from images

    You can use it from the command line with various options:

    ```bash
    # Use Mistral OCR to process a PDF and output markdown
    python document_extraction.py document.pdf --method ocr --format markdown --output result.md

    # Use vision LLM to extract structured data from an image using the receipt schema
    python document_extraction.py receipt.jpg --method vision --schema receipt --output receipt_data.json
    ```

    Or import the functions in your own code:

    ```python
    from document_extraction import process_document_with_mistral_ocr, extract_structured_data_from_image, ReceiptData

    # Process a document with Mistral OCR
    ocr_result = process_document_with_mistral_ocr("document.pdf", output_format="markdown")

    # Extract structured data from a receipt image
    receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
    ```

    The script provides flexibility to choose the appropriate method based on your needs and the type of document you're processing.
  3. @CraftsMan-Labs revised this gist Mar 7, 2025. 1 changed file with 38 additions and 197 deletions.
    235 changes: 38 additions & 197 deletions IDP.md
    @@ -1,103 +1,8 @@
    # Extracting Structured Data from Images Using Pydantic and LLMs

    This comprehensive guide demonstrates how to create a robust system that uses Large Language Models (LLMs) with vision capabilities to extract structured data from images. By combining Pydantic's data validation with LLMs' image analysis capabilities, we can transform unstructured visual information into well-defined, type-safe Python objects.
    This guide demonstrates how to create a system that uses vision-capable LLMs to extract structured data from images using Pydantic for data validation.

    To create Mermaid JS diagrams that work well in GitHub Gists, you need to use the proper syntax within a code block with the "mermaid" language identifier. GitHub has native support for Mermaid diagrams, allowing you to create various visualizations directly in your Gists.

    ## Basic Mermaid Setup for GitHub Gists

    Simply wrap your Mermaid syntax in a code block with the `mermaid` language identifier:

    ```mermaid
    graph TD;
    A-->B;
    A-->C;
    B-->D;
    C-->D;
    ```

    ## Common Diagram Types

    ### Flowchart

    ```mermaid
    flowchart LR
    A[Start] --> B{Decision}
    B -- Yes --> C[Process 1]
    B -- No --> D[Process 2]
    C --> E[End]
    D --> E
    ```

    ### Sequence Diagram

    ```mermaid
    sequenceDiagram
    participant Client
    participant Server
    Client->>Server: Request Data
    Server->>Client: Response
    Note over Client,Server: Simple HTTP Request
    ```

    ### Class Diagram

    ```mermaid
    classDiagram
    Animal <|-- Duck
    Animal <|-- Fish
    Animal: +int age
    Animal: +String gender
    class Duck{
    +String beakColor
    +swim()
    +quack()
    }
    ```

    ### Pie Chart

    ```mermaid
    pie
    title Distribution
    "Category A" : 42
    "Category B" : 28
    "Category C" : 30
    ```

    ### Git Graph

    ```mermaid
    gitGraph
    commit id: "initial"
    branch develop
    checkout develop
    commit id: "feature"
    checkout main
    merge develop tag: "v1.0.0"
    commit id: "hotfix"
    ```

    ## Tips for GitHub Gists

    1. Ensure each diagram has its own separate code block
    2. Add semicolons at the end of each line for better compatibility
    3. Keep diagrams simple as complex ones may not render correctly
    4. Use the Mermaid Live Editor to test your diagrams before adding them to Gists

    ## Understanding the Need for Structured Image Data

    Visual information is inherently unstructured, making it challenging to process programmatically. Images contain rich information—people, objects, text, scenes—but extracting this data in a consistent, usable format requires sophisticated tools and techniques. The integration of Pydantic with vision-capable LLMs provides an elegant solution to this problem.

    Pydantic offers robust data validation and schema generation, while LLMs can analyze and interpret visual content. Together, they form a powerful pipeline for extracting structured data from images that can be seamlessly integrated into your applications.

    ## Core Components of the Solution

    The solution consists of several key components that work together to transform image data into structured Pydantic objects:

    ### Base Image Data Extraction Function

    This function serves as the core component that processes an image and returns structured data based on a Pydantic model:
    ## Core Components

    ```python
    import litellm
    @@ -219,15 +124,17 @@ def extract_structured_data_from_image(
    structured_data = model_class.model_validate_json(json_str)
    return structured_data
    else:
    if attempt = max_retries:
    if attempt == max_retries:
    return model_class()
    except Exception as e:
    print(f"Error in attempt {attempt + 1}: {e}")
    if attempt == max_retries:
    return model_class()

    return model_class()
    ```

    ### Utility Functions for Data Handling

    These functions help manage the extracted data:
    ## Utility Functions

    ```python
    def save_structured_data(data: BaseModel, output_path: str):
    @@ -243,11 +150,7 @@ def read_structured_data(file_path: str, model_class: Type[T]) -> T:
    return model_class.model_validate_json(json_data)
    ```

    ## Defining Pydantic Models for Different Use Cases

    The power of this approach lies in its flexibility. By defining different Pydantic models, we can extract various types of structured data from images:

    ### General Image Analysis Model
    ## Pydantic Models

    ```python
    class Person(BaseModel):
    @@ -281,11 +184,7 @@ class ImageData(BaseModel):
    scene_description: SceneDescription = Field(default_factory=SceneDescription)
    text_in_image: Optional[List[str]] = None
    additional_notes: Optional[str] = None
    ```

    ### Document Analysis Model

    ```python
    class DocumentData(BaseModel):
    """Data model for extracting information from document images."""
    title: Optional[str] = None
    @@ -294,11 +193,7 @@ class DocumentData(BaseModel):
    content_summary: Optional[str] = None
    key_points: List[str] = Field(default_factory=list)
    entities_mentioned: List[str] = Field(default_factory=list)
    ```

    ### Product Information Model

    ```python
    class ProductData(BaseModel):
    """Data model for extracting product information from images."""
    product_name: Optional[str] = None
    @@ -310,25 +205,7 @@ class ProductData(BaseModel):
    estimated_price_range: Optional[str] = None
    ```

    ## Implementation Details and Technical Considerations

    The implementation includes several important technical aspects that ensure reliability and robustness:

    ### Prompt Engineering

    The system automatically generates effective prompts based on the Pydantic model's JSON schema. This ensures the LLM understands exactly what information to extract and in what format. The prompt generator creates detailed instructions that guide the model to produce properly structured outputs.

    ### Error Handling and Retries

    The extraction function implements a retry mechanism that attempts to recover from parsing failures. If the LLM produces invalid or unparseable JSON, the system will retry with a more explicit prompt that emphasizes the need for valid JSON output.

    ### JSON Extraction from Messy Responses

    LLMs sometimes include explanatory text or markdown formatting in their responses. The `extract_json_from_text` function uses regular expressions to find and extract valid JSON from potentially messy responses, making the system more robust.

    ## Using the System in Practice

    Here's how to use the system to extract structured data from images:
    ## Example Usage

    ```python
    # Example usage
    @@ -348,69 +225,33 @@ if __name__ == "__main__":
    # Extract data with a different model
    document_data = extract_structured_data_from_image(image_path, DocumentData)
    print(document_data.model_dump_json(indent=2))
    ```

    ## Advanced Features and Customization

    The system is designed to be flexible and customizable:

    ### Custom Prompts

    While the default prompt generator works well for most cases, you can supply custom prompts to guide the extraction process more specifically:

    ```python
    custom_prompt = """
    Examine this image carefully and extract the following information:
    1. Identify all people present
    2. Describe their clothing in detail
    3. List all visible objects
    4. Note any text visible in the image
    Format the response as valid JSON according to the provided schema.
    """

    data = extract_structured_data_from_image(
    image_path,
    ImageData,
    custom_prompt=custom_prompt
    )
    ```

    ### Custom Model Definitions

    You can create custom Pydantic models to extract exactly the information you need:

    ```python
    class ReceiptData(BaseModel):
    """Model for extracting information from receipts."""
    store_name: Optional[str] = None
    date: Optional[str] = None
    total_amount: Optional[str] = None
    items: List[Dict[str, Any]] = Field(default_factory=list)
    payment_method: Optional[str] = None
    tax_amount: Optional[str] = None

    # Custom prompt example
    custom_prompt = """
    Examine this image carefully and extract the following information:
    1. Identify all people present
    2. Describe their clothing in detail
    3. List all visible objects
    4. Note any text visible in the image
    Format the response as valid JSON according to the provided schema.
    """

    receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
    data = extract_structured_data_from_image(
    image_path,
    ImageData,
    custom_prompt=custom_prompt
    )

    # Custom model example
    class ReceiptData(BaseModel):
    """Model for extracting information from receipts."""
    store_name: Optional[str] = None
    date: Optional[str] = None
    total_amount: Optional[str] = None
    items: List[Dict[str, Any]] = Field(default_factory=list)
    payment_method: Optional[str] = None
    tax_amount: Optional[str] = None

    receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
    ```

    ## Technical Implementation Details

    The solution leverages several key technologies:

    1. **Pydantic** for data validation and schema generation[1][5][7]
    2. **LiteLLM** as a client for interacting with LLM APIs
    3. **Regular Expressions** for extracting JSON from model responses
    4. **Type Hints** to ensure code correctness and enable better IDE support

    The implementation follows best practices for error handling, retries, and data validation. It uses Pydantic's JSON validation capabilities to ensure the extracted data conforms to the expected structure[4][6].

    ## Conclusion

    The integration of Pydantic with vision-capable LLMs provides a powerful framework for extracting structured data from images. This approach offers several advantages:

    1. **Type Safety**: The extracted data is guaranteed to match the defined structure.
    2. **Flexibility**: Custom models can be defined for different extraction needs.
    3. **Robustness**: Error handling and retry mechanisms ensure reliable operation.
    4. **Ease of Use**: The extracted data is immediately available as Python objects.

    By following the approach outlined in this guide, you can transform unstructured visual information into structured, validated data that can be easily integrated into your applications and workflows. This enables numerous applications, from document processing to scene analysis, product recognition, and beyond.
  4. @CraftsMan-Labs revised this gist Mar 7, 2025. 1 changed file with 83 additions and 0 deletions.
    83 changes: 83 additions & 0 deletions IDP.md
    @@ -2,6 +2,89 @@

    This comprehensive guide demonstrates how to create a robust system that uses Large Language Models (LLMs) with vision capabilities to extract structured data from images. By combining Pydantic's data validation with LLMs' image analysis capabilities, we can transform unstructured visual information into well-defined, type-safe Python objects.

    To create Mermaid JS diagrams that work well in GitHub Gists, you need to use the proper syntax within a code block with the "mermaid" language identifier. GitHub has native support for Mermaid diagrams, allowing you to create various visualizations directly in your Gists.

    ## Basic Mermaid Setup for GitHub Gists

    Simply wrap your Mermaid syntax in a code block with the `mermaid` language identifier:

    ```mermaid
    graph TD;
    A-->B;
    A-->C;
    B-->D;
    C-->D;
    ```

    ## Common Diagram Types

    ### Flowchart

    ```mermaid
    flowchart LR
    A[Start] --> B{Decision}
    B -- Yes --> C[Process 1]
    B -- No --> D[Process 2]
    C --> E[End]
    D --> E
    ```

    ### Sequence Diagram

    ```mermaid
    sequenceDiagram
    participant Client
    participant Server
    Client->>Server: Request Data
    Server->>Client: Response
    Note over Client,Server: Simple HTTP Request
    ```

    ### Class Diagram

    ```mermaid
    classDiagram
    Animal <|-- Duck
    Animal <|-- Fish
    Animal: +int age
    Animal: +String gender
    class Duck{
    +String beakColor
    +swim()
    +quack()
    }
    ```

    ### Pie Chart

    ```mermaid
    pie
    title Distribution
    "Category A" : 42
    "Category B" : 28
    "Category C" : 30
    ```

    ### Git Graph

    ```mermaid
    gitGraph
    commit id: "initial"
    branch develop
    checkout develop
    commit id: "feature"
    checkout main
    merge develop tag: "v1.0.0"
    commit id: "hotfix"
    ```

    ## Tips for GitHub Gists

    1. Ensure each diagram has its own separate code block
    2. Add semicolons at the end of each line for better compatibility
    3. Keep diagrams simple as complex ones may not render correctly
    4. Use the Mermaid Live Editor to test your diagrams before adding them to Gists

    ## Understanding the Need for Structured Image Data

    Visual information is inherently unstructured, making it challenging to process programmatically. Images contain rich information—people, objects, text, scenes—but extracting this data in a consistent, usable format requires sophisticated tools and techniques. The integration of Pydantic with vision-capable LLMs provides an elegant solution to this problem.
  5. @CraftsMan-Labs revised this gist Mar 7, 2025. No changes.
  6. @CraftsMan-Labs revised this gist Mar 7, 2025. No changes.
  7. @CraftsMan-Labs revised this gist Mar 7, 2025. No changes.
  8. @CraftsMan-Labs created this gist Mar 7, 2025.
    333 changes: 333 additions & 0 deletions IDP.md
    @@ -0,0 +1,333 @@
    # Extracting Structured Data from Images Using Pydantic and LLMs

    This comprehensive guide demonstrates how to create a robust system that uses Large Language Models (LLMs) with vision capabilities to extract structured data from images. By combining Pydantic's data validation with LLMs' image analysis capabilities, we can transform unstructured visual information into well-defined, type-safe Python objects.

    ## Understanding the Need for Structured Image Data

    Visual information is inherently unstructured, making it challenging to process programmatically. Images contain rich information—people, objects, text, scenes—but extracting this data in a consistent, usable format requires sophisticated tools and techniques. The integration of Pydantic with vision-capable LLMs provides an elegant solution to this problem.

    Pydantic offers robust data validation and schema generation, while LLMs can analyze and interpret visual content. Together, they form a powerful pipeline for extracting structured data from images that can be seamlessly integrated into your applications.

    ## Core Components of the Solution

    The solution consists of several key components that work together to transform image data into structured Pydantic objects:

    ### Base Image Data Extraction Function

    This function serves as the core component that processes an image and returns structured data based on a Pydantic model:

    ```python
    import litellm
    import base64
    import json
    import re
    from typing import List, Dict, Any, Optional, Union, Type, TypeVar
    from pydantic import BaseModel, Field

    T = TypeVar('T', bound=BaseModel)

    def encode_image_to_base64(image_path):
    """Convert an image file to base64 encoding."""
    with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')

    def extract_json_from_text(text):
    """Extract JSON object from text that might contain additional content."""
    # Try to find JSON in markdown code blocks
    json_match = re.search(r'``````', text)
    if json_match:
    json_str = json_match.group(1)
    else:
    # Try to find JSON based on braces
    json_match = re.search(r'({[\s\S]*})', text)
    if json_match:
    json_str = json_match.group(1)
    else:
    json_str = text

    # Clean the string and try to parse it
    json_str = json_str.strip()
    try:
    json.loads(json_str)
    return json_str
    except json.JSONDecodeError:
    return None

    def generate_extraction_prompt(model_class):
    """Generate a prompt based on a Pydantic model structure."""
    schema = model_class.model_json_schema()

    prompt = "Analyze this image and extract information in JSON format.\n\n"
    prompt += f"Return the data according to this JSON schema:\n``````\n\n"
    prompt += "Important guidelines:\n"
    prompt += "1. Only return valid JSON that conforms to the schema\n"
    prompt += "2. If you're not sure about a field, use null instead of guessing\n"
    prompt += "3. Don't add any explanations outside the JSON structure\n"
    prompt += "4. Extract as much relevant information as possible\n"

    return prompt

    def extract_structured_data_from_image(
    image_path: str,
    model_class: Type[T],
    model_name: str = "ollama/llava-phi3",
    api_base: str = "http://localhost:11434",
    custom_prompt: Optional[str] = None,
    max_retries: int = 2
    ) -> T:
    """
    Extract structured data from an image based on a Pydantic model.
    Args:
    image_path: Path to the image file
    model_class: Pydantic model class defining the structure to extract
    model_name: Name of the vision model to use
    api_base: API endpoint for the model
    custom_prompt: Optional custom prompt to use instead of the generated one
    max_retries: Maximum number of retry attempts if parsing fails
    Returns:
    An instance of the provided model_class with extracted data
    """
    # Get base64 string of the image
    try:
    base64_image = encode_image_to_base64(image_path)
    except Exception as e:
    print(f"Error encoding image: {e}")
    return model_class()

    # Generate or use the provided prompt
    prompt = custom_prompt if custom_prompt else generate_extraction_prompt(model_class)

    # Try extraction with retries
    for attempt in range(max_retries + 1):
    try:
    # Make the API call to the model
    response = litellm.completion(
    model=model_name,
    messages=[
    {
    "role": "user",
    "content": [
    {
    "type": "text",
    "text": prompt
    },
    {
    "type": "image_url",
    "image_url": {
    "url": f"data:image/png;base64,{base64_image}"
    }
    }
    ]
    }
    ],
    api_base=api_base
    )

    # Extract the text response
    response_text = response.choices[0].message.content

    # Extract JSON from the response
    json_str = extract_json_from_text(response_text)

    if json_str:
    # Try to parse the extracted JSON
    structured_data = model_class.model_validate_json(json_str)
    return structured_data
    else:
    if attempt = max_retries:
    return model_class()

    return model_class()
    ```

    ### Utility Functions for Data Handling

    These functions help manage the extracted data:

    ```python
    def save_structured_data(data: BaseModel, output_path: str):
    """Save structured data to a JSON file."""
    with open(output_path, "w") as f:
    f.write(data.model_dump_json(indent=2))
    print(f"Data saved to {output_path}")

    def read_structured_data(file_path: str, model_class: Type[T]) -> T:
    """Read structured data from a JSON file into a Pydantic model."""
    with open(file_path, "r") as f:
    json_data = f.read()
    return model_class.model_validate_json(json_data)
    ```
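
    For example, a quick round trip might look like this (a sketch assuming `image_data` is an instance of the `ImageData` model defined in the next section):

    ```python
    # Persist the extracted data, then load it back into the same model type.
    save_structured_data(image_data, "extracted_image_data.json")
    restored = read_structured_data("extracted_image_data.json", ImageData)
    assert restored == image_data
    ```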

    ## Defining Pydantic Models for Different Use Cases

    The power of this approach lies in its flexibility. By defining different Pydantic models, we can extract various types of structured data from images:

    ### General Image Analysis Model

    ```python
    class Person(BaseModel):
    """Data model for a person detected in an image."""
    name: Optional[str] = None
    gender: Optional[str] = None
    approximate_age: Optional[str] = None
    clothing_description: Optional[str] = None
    position_in_image: Optional[str] = None

    class Object(BaseModel):
    """Data model for an object detected in an image."""
    name: str
    color: Optional[str] = None
    size: Optional[str] = None
    position_in_image: Optional[str] = None
    quantity: Optional[int] = 1

    class SceneDescription(BaseModel):
    """Data model for an overall scene description."""
    setting: Optional[str] = None
    time_of_day: Optional[str] = None
    weather: Optional[str] = None
    general_mood: Optional[str] = None
    key_activities: Optional[List[str]] = None

    class ImageData(BaseModel):
    """Overall data model for information extracted from an image."""
    persons: List[Person] = Field(default_factory=list)
    objects: List[Object] = Field(default_factory=list)
    scene_description: SceneDescription = Field(default_factory=SceneDescription)
    text_in_image: Optional[List[str]] = None
    additional_notes: Optional[str] = None
    ```

    ### Document Analysis Model

    ```python
    class DocumentData(BaseModel):
    """Data model for extracting information from document images."""
    title: Optional[str] = None
    date: Optional[str] = None
    document_type: Optional[str] = None
    content_summary: Optional[str] = None
    key_points: List[str] = Field(default_factory=list)
    entities_mentioned: List[str] = Field(default_factory=list)
    ```

    ### Product Information Model

    ```python
    class ProductData(BaseModel):
    """Data model for extracting product information from images."""
    product_name: Optional[str] = None
    brand: Optional[str] = None
    category: Optional[str] = None
    color: Optional[str] = None
    features: List[str] = Field(default_factory=list)
    condition: Optional[str] = None
    estimated_price_range: Optional[str] = None
    ```

    ## Implementation Details and Technical Considerations

    The implementation includes several important technical aspects that ensure reliability and robustness:

    ### Prompt Engineering

    The system automatically generates effective prompts based on the Pydantic model's JSON schema. This ensures the LLM understands exactly what information to extract and in what format. The prompt generator creates detailed instructions that guide the model to produce properly structured outputs.
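
    For instance, one way to see what those generated instructions look like is to print the prompt for a small model (this sketch assumes the `generate_extraction_prompt` helper defined earlier in this guide; `BusinessCard` is a hypothetical model used purely for illustration):

    ```python
    from typing import Optional
    from pydantic import BaseModel

    class BusinessCard(BaseModel):
        """Hypothetical model used only to illustrate prompt generation."""
        name: Optional[str] = None
        company: Optional[str] = None
        email: Optional[str] = None

    # The prompt embeds the model's JSON schema plus the formatting guidelines,
    # so the LLM knows exactly which fields and types it should return.
    print(generate_extraction_prompt(BusinessCard))
    ```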

    ### Error Handling and Retries

    The extraction function implements a retry mechanism that attempts to recover from parsing failures. If the LLM produces invalid or unparseable JSON, the system will retry with a more explicit prompt that emphasizes the need for valid JSON output.
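
    The retry idea can be sketched on its own, independent of the full extraction function above (here `call_model` is a stand-in for any callable that sends a prompt and returns the model's raw text; it is not part of the code in this guide):

    ```python
    import json

    def parse_with_retries(call_model, prompt: str, max_retries: int = 2):
        """Ask the model repeatedly until its output parses as JSON, or give up."""
        for attempt in range(max_retries + 1):
            text = call_model(prompt)
            try:
                return json.loads(text)
            except json.JSONDecodeError:
                # On failure, re-ask with a more explicit instruction before giving up.
                prompt += "\n\nReturn ONLY valid JSON, with no text outside the JSON object."
        return None
    ```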

    ### JSON Extraction from Messy Responses

    LLMs sometimes include explanatory text or markdown formatting in their responses. The `extract_json_from_text` function uses regular expressions to find and extract valid JSON from potentially messy responses, making the system more robust.
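
    As a rough illustration (reusing the `extract_json_from_text` helper defined earlier), a response wrapped in conversational filler still yields a clean JSON string via the brace-matching fallback:

    ```python
    # A typical "messy" model response: valid JSON surrounded by chatty text.
    messy_response = (
        "Sure! Here is the extracted data you asked for:\n"
        '{"store_name": "Corner Cafe", "total_amount": "12.50"}\n'
        "Let me know if you need anything else."
    )

    json_str = extract_json_from_text(messy_response)
    print(json_str)  # {"store_name": "Corner Cafe", "total_amount": "12.50"}
    ```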

    ## Using the System in Practice

    Here's how to use the system to extract structured data from images:

    ```python
    # Example usage
    if __name__ == "__main__":
    # Define your image path
    image_path = "path/to/your/image.jpg"

    # Extract general image data
    image_data = extract_structured_data_from_image(image_path, ImageData)

    # Print the extracted data
    print(image_data.model_dump_json(indent=2))

    # Save the extracted data
    save_structured_data(image_data, "extracted_image_data.json")

    # Extract data with a different model
    document_data = extract_structured_data_from_image(image_path, DocumentData)
    print(document_data.model_dump_json(indent=2))
    ```

    ## Advanced Features and Customization

    The system is designed to be flexible and customizable:

    ### Custom Prompts

    While the default prompt generator works well for most cases, you can supply custom prompts to guide the extraction process more specifically:

    ```python
    custom_prompt = """
    Examine this image carefully and extract the following information:
    1. Identify all people present
    2. Describe their clothing in detail
    3. List all visible objects
    4. Note any text visible in the image
    Format the response as valid JSON according to the provided schema.
    """

    data = extract_structured_data_from_image(
    image_path,
    ImageData,
    custom_prompt=custom_prompt
    )
    ```

    ### Custom Model Definitions

    You can create custom Pydantic models to extract exactly the information you need:

    ```python
    class ReceiptData(BaseModel):
    """Model for extracting information from receipts."""
    store_name: Optional[str] = None
    date: Optional[str] = None
    total_amount: Optional[str] = None
    items: List[Dict[str, Any]] = Field(default_factory=list)
    payment_method: Optional[str] = None
    tax_amount: Optional[str] = None

    receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
    ```

    ## Technical Implementation Details

    The solution leverages several key technologies:

    1. **Pydantic** for data validation and schema generation[1][5][7]
    2. **LiteLLM** as a client for interacting with LLM APIs
    3. **Regular Expressions** for extracting JSON from model responses
    4. **Type Hints** to ensure code correctness and enable better IDE support

    The implementation follows best practices for error handling, retries, and data validation. It uses Pydantic's JSON validation capabilities to ensure the extracted data conforms to the expected structure[4][6].
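
    As a small sketch of that validation step (reusing the `ProductData` model defined earlier), a payload that does not match the schema is rejected with a `ValidationError` rather than silently accepted:

    ```python
    from pydantic import ValidationError

    good = '{"product_name": "Desk Lamp", "features": ["LED", "dimmable"]}'
    bad = '{"product_name": "Desk Lamp", "features": "LED"}'  # features must be a list

    print(ProductData.model_validate_json(good).product_name)  # Desk Lamp

    try:
        ProductData.model_validate_json(bad)
    except ValidationError as exc:
        print(f"Rejected invalid payload with {exc.error_count()} validation error(s)")
    ```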

    ## Conclusion

    The integration of Pydantic with vision-capable LLMs provides a powerful framework for extracting structured data from images. This approach offers several advantages:

    1. **Type Safety**: The extracted data is guaranteed to match the defined structure.
    2. **Flexibility**: Custom models can be defined for different extraction needs.
    3. **Robustness**: Error handling and retry mechanisms ensure reliable operation.
    4. **Ease of Use**: The extracted data is immediately available as Python objects.

    By following the approach outlined in this guide, you can transform unstructured visual information into structured, validated data that can be easily integrated into your applications and workflows. This enables numerous applications, from document processing to scene analysis, product recognition, and beyond.