Revisions
-
CraftsMan-Labs revised this gist
Mar 7, 2025. 1 changed file with 74 additions and 96 deletions.
```python
import os
import base64

# @@ -25,31 +23,31 @@ def process_document_with_mistral_ocr(
    Process a document using Mistral OCR API and return the results.

    Args:
        file_path (str): Path to the document file to process.
        api_key (str, optional): Mistral API key. Defaults to MISTRAL_API_KEY environment variable.
        model (str, optional): Mistral OCR model to use. Defaults to "mistral-ocr-latest".
        output_format (str, optional): Output format - "markdown", "json", or "html". Defaults to "markdown".
        save_to_file (str, optional): Path to save the output. If None, returns the result.

    Returns:
        The OCR results in the specified format.
    """
    # Import here to make it optional
    from mistralai import Mistral

    # Get API key from environment if not provided
    if not api_key:
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("No API key provided and MISTRAL_API_KEY environment variable not set.")

    # Initialize Mistral client
    client = Mistral(api_key=api_key)

    # Process the file
    pdf_file = Path(file_path)
    print(f"Uploading file {pdf_file.name}...")

    # Upload the file
    uploaded_file = client.files.upload(
        file={

# @@ -58,69 +56,70 @@ def process_document_with_mistral_ocr(
        },
        purpose="ocr",
    )

    # Get signed URL for the uploaded file
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
    print(f"Processing with OCR model: {model}...")

    # Process the document with OCR
    ocr_response = client.ocr.process(
        document={"type": "document_url", "document_url": signed_url.url},
        model=model,
        include_image_base64=True,
    )

    # Process the response based on the requested output format
    if output_format == "json":
        result = json.loads(ocr_response.model_dump_json())
    else:
        # Get markdown content
        result = ocr_response.pages.markdown

        # Convert to HTML if requested
        if output_format == "html":
            import markdown
            result = markdown.markdown(result, extensions=['tables', 'fenced_code'])
            result = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 20px; }}
img {{ max-width: 100%; }}
pre {{ background-color: #f5f5f5; padding: 10px; overflow: auto; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; }}
th {{ background-color: #f2f2f2; }}
</style>
</head>
<body>
{result}
</body>
</html>
"""

    # Save to file if requested
    if save_to_file:
        with open(save_to_file, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"Results saved to {save_to_file}")
        return None

    return result

# ===== VISION LLM EXTRACTION IMPLEMENTATION =====

def encode_image_to_base64(image_path: str) -> str:
    """Convert an image file to base64 encoding."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_json_from_text(text: str) -> Optional[str]:
    """Extract JSON object from text that might contain additional content."""
    # Try to find JSON in markdown code blocks (using backticks)
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', text)
    if json_match:
        json_str = json_match.group(1)
    else:

# @@ -130,27 +129,24 @@ def extract_json_from_text(text):
            json_str = json_match.group(1)
        else:
            json_str = text

    json_str = json_str.strip()
    try:
        json.loads(json_str)
        return json_str
    except json.JSONDecodeError:
        return None

def generate_extraction_prompt(model_class: Type[T]) -> str:
    """Generate a prompt based on a Pydantic model structure."""
    schema = model_class.model_json_schema()
    prompt = "Analyze this image and extract information in JSON format.\n\n"
    prompt += f"Return the data according to this JSON schema:\n```json\n{json.dumps(schema, indent=2)}\n```\n\n"
    prompt += "Important guidelines:\n"
    prompt += "1. Only return valid JSON that conforms to the schema.\n"
    prompt += "2. If you're not sure about a field, use null instead of guessing.\n"
    prompt += "3. Don't add any explanations outside the JSON structure.\n"
    prompt += "4. Extract as much relevant information as possible.\n"
    return prompt

def extract_structured_data_from_image(

# @@ -165,33 +161,33 @@ def extract_structured_data_from_image(
    Extract structured data from an image based on a Pydantic model.

    Args:
        image_path (str): Path to the image file.
        model_class (Type[T]): Pydantic model class defining the structure to extract.
        model_name (str): Name of the vision model to use.
        api_base (str): API endpoint for the model.
        custom_prompt (Optional[str]): Optional custom prompt to use. If provided, this overrides the generated prompt.
        max_retries (int): Maximum number of retry attempts if parsing fails.

    Returns:
        An instance of the provided model_class with extracted data.
    """
    import litellm

    try:
        base64_image = encode_image_to_base64(image_path)
    except Exception as e:
        print(f"Error encoding image: {e}")
        return model_class()

    # Use a custom prompt if provided; otherwise, use generated prompt
    prompt = custom_prompt if custom_prompt else generate_extraction_prompt(model_class)

    # For a simplified use case (like the additional code snippet), you might want to override
    # the prompt with a fixed instruction. Uncomment the following line to use a fixed prompt:
    # prompt = "Extract all the relevant data from the image in JSON format"

    for attempt in range(max_retries + 1):
        try:
            response = litellm.completion(
                model=model_name,
                messages=[

# @@ -214,18 +210,15 @@ def extract_structured_data_from_image(
                api_base=api_base
            )

            response_text = response.choices[0].message.content
            json_str = extract_json_from_text(response_text)

            if json_str:
                structured_data = model_class.model_validate_json(json_str)
                return structured_data
            else:
                if attempt == max_retries:
                    print("Failed to extract JSON from response.")
                    return model_class()
        except Exception as e:
            print(f"Error in attempt {attempt + 1}: {e}")

# @@ -314,7 +307,7 @@ class ReceiptData(BaseModel):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Extract data from documents and images")
    parser.add_argument("file_path", help="Path to the document or image file to process")
    parser.add_argument("--method", choices=["ocr", "vision"], default="ocr",

# @@ -326,9 +319,10 @@ if __name__ == "__main__":
    parser.add_argument("--output", "-o", help="Path to save the output")
    parser.add_argument("--schema", choices=["image", "document", "product", "receipt"], default="document",
                        help="Schema to use for vision method")
    parser.add_argument("--custom_prompt", help="Optional custom prompt for vision extraction", default=None)

    args = parser.parse_args()

    # Process based on method
    if args.method == "ocr":
        # Use Mistral OCR

# @@ -339,59 +333,43 @@ if __name__ == "__main__":
            output_format=args.format,
            save_to_file=args.output
        )
        if result and not args.output:
            print(result)
    else:
        # Use vision LLM extraction
        model_classes = {
            "image": ImageData,
            "document": DocumentData,
            "product": ProductData,
            "receipt": ReceiptData
        }
        model_class = model_classes[args.schema]

        result = extract_structured_data_from_image(
            args.file_path,
            model_class,
            model_name=args.model or "ollama/llava-phi3",
            custom_prompt=args.custom_prompt,
        )

        if args.output:
            save_structured_data(result, args.output)
        else:
            print(result.model_dump_json(indent=2))
```

### Key Updates

- **Custom vs. Generated Prompts:** You can now override the generated prompt by passing a `--custom_prompt` parameter. (The code also includes a commented line that you can uncomment if you prefer a fixed prompt.)
- **Consistent Base64 Encoding:** The `encode_image_to_base64` function is shared between the helper and extraction functions.
- **Unified Vision Extraction:** The vision extraction function uses the same `litellm.completion` call as in your provided snippet, ensuring the image is sent in a proper base64 format.
- **Command Line Flexibility:** Run the script from the command line using the provided examples, choosing between OCR and vision extraction modes.

This updated script now brings together both methods with enhanced flexibility for your image and document processing needs.
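For quick reference, here is a minimal usage sketch (illustrative only, not part of the gist; the image path is a placeholder) of the new `custom_prompt` parameter when calling the extraction function directly rather than via the CLI:

```python
# Illustrative sketch: override the schema-generated prompt with the fixed
# instruction mentioned in the notes above. "receipt.jpg" is a placeholder path.
receipt = extract_structured_data_from_image(
    "receipt.jpg",
    ReceiptData,
    model_name="ollama/llava-phi3",
    custom_prompt="Extract all the relevant data from the image in JSON format",
)
print(receipt.model_dump_json(indent=2))
```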
CraftsMan-Labs revised this gist
Mar 7, 2025. 1 changed file with 196 additions and 56 deletions.
Here's a Python script that combines both approaches - using Mistral OCR and the vision-capable LLM extraction approach:

```python
import os
import base64
import json
import re
from typing import List, Dict, Any, Optional, Union, Type, TypeVar
from pydantic import BaseModel, Field
from pathlib import Path

# Type variable for Pydantic models
T = TypeVar('T', bound=BaseModel)

# ===== MISTRAL OCR IMPLEMENTATION =====

def process_document_with_mistral_ocr(
    file_path,
    api_key=None,
    model="mistral-ocr-latest",
    output_format="markdown",
    save_to_file=None
):
    """
    Process a document using Mistral OCR API and return the results.

    Args:
        file_path (str): Path to the document file to process
        api_key (str, optional): Mistral API key. Defaults to MISTRAL_API_KEY environment variable.
        model (str, optional): Mistral OCR model to use. Defaults to "mistral-ocr-latest".
        output_format (str, optional): Output format - "markdown", "json", or "html". Defaults to "markdown".
        save_to_file (str, optional): Path to save the output. If None, returns the result.

    Returns:
        The OCR results in the specified format
    """
    # Import here to make it optional
    from mistralai import Mistral

    # Get API key from environment if not provided
    if not api_key:
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("No API key provided and MISTRAL_API_KEY environment variable not set.")

    # Initialize Mistral client
    client = Mistral(api_key=api_key)

    # Process the file
    pdf_file = Path(file_path)
    print(f"Uploading file {pdf_file.name}...")

    # Upload the file
    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_file.stem,
            "content": pdf_file.read_bytes(),
        },
        purpose="ocr",
    )

    # Get signed URL for the uploaded file
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
    print(f"Processing with OCR model: {model}...")

    # Process the document with OCR
    ocr_response = client.ocr.process(
        document={"type": "document_url", "document_url": signed_url.url},
        model=model,
        include_image_base64=True,
    )

    # Process the response based on the requested output format
    if output_format == "json":
        result = json.loads(ocr_response.model_dump_json())
    else:
        # Get markdown content
        result = ocr_response.pages.markdown

        # Convert to HTML if requested
        if output_format == "html":
            import markdown
            result = markdown.markdown(result, extensions=['tables', 'fenced_code'])
            result = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>OCR Result</title>
<style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 20px; }}
img {{ max-width: 100%; }}
pre {{ background-color: #f5f5f5; padding: 10px; overflow: auto; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; }}
th {{ background-color: #f2f2f2; }}
</style>
</head>
<body>
{result}
</body>
</html>
"""

    # Save to file if requested
    if save_to_file:
        with open(save_to_file, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"Results saved to {save_to_file}")
        return None

    return result

# ===== VISION LLM EXTRACTION IMPLEMENTATION =====

def encode_image_to_base64(image_path):
    """Convert an image file to base64 encoding."""
    with open(image_path, "rb") as image_file:

# @@ -77,6 +175,9 @@ def extract_structured_data_from_image(
    Returns:
        An instance of the provided model_class with extracted data
    """
    # Import here to make it optional
    import litellm

    # Get base64 string of the image
    try:
        base64_image = encode_image_to_base64(image_path)

# @@ -132,11 +233,9 @@ def extract_structured_data_from_image(
                    return model_class()

    return model_class()

# ===== UTILITY FUNCTIONS =====

def save_structured_data(data: BaseModel, output_path: str):
    """Save structured data to a JSON file."""
    with open(output_path, "w") as f:

# @@ -148,11 +247,9 @@ def read_structured_data(file_path: str, model_class: Type[T]) -> T:
    with open(file_path, "r") as f:
        json_data = f.read()
    return model_class.model_validate_json(json_data)

# ===== PYDANTIC MODELS =====

class Person(BaseModel):
    """Data model for a person detected in an image."""
    name: Optional[str] = None

# @@ -203,55 +300,98 @@ class ProductData(BaseModel):
    features: List[str] = Field(default_factory=list)
    condition: Optional[str] = None
    estimated_price_range: Optional[str] = None

class ReceiptData(BaseModel):
    """Model for extracting information from receipts."""
    store_name: Optional[str] = None
    date: Optional[str] = None
    total_amount: Optional[str] = None
    items: List[Dict[str, Any]] = Field(default_factory=list)
    payment_method: Optional[str] = None
    tax_amount: Optional[str] = None

# ===== EXAMPLE USAGE =====

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Extract data from documents and images")
    parser.add_argument("file_path", help="Path to the document or image file to process")
    parser.add_argument("--method", choices=["ocr", "vision"], default="ocr",
                        help="Method to use: 'ocr' for Mistral OCR or 'vision' for vision LLM extraction")
    parser.add_argument("--api-key", help="API key (defaults to environment variable)")
    parser.add_argument("--model", help="Model to use (defaults based on method)")
    parser.add_argument("--format", choices=["markdown", "json", "html"], default="markdown",
                        help="Output format for OCR method")
    parser.add_argument("--output", "-o", help="Path to save the output")
    parser.add_argument("--schema", choices=["image", "document", "product", "receipt"], default="document",
                        help="Schema to use for vision method")

    args = parser.parse_args()

    # Process based on method
    if args.method == "ocr":
        # Use Mistral OCR
        result = process_document_with_mistral_ocr(
            args.file_path,
            api_key=args.api_key,
            model=args.model or "mistral-ocr-latest",
            output_format=args.format,
            save_to_file=args.output
        )
        if result and not args.output:
            print(result)
    else:
        # Use vision LLM extraction
        # Select model class based on schema
        model_classes = {
            "image": ImageData,
            "document": DocumentData,
            "product": ProductData,
            "receipt": ReceiptData
        }
        model_class = model_classes[args.schema]

        # Extract data
        result = extract_structured_data_from_image(
            args.file_path,
            model_class,
            model_name=args.model or "ollama/llava-phi3"
        )

        # Save or print result
        if args.output:
            save_structured_data(result, args.output)
        else:
            print(result.model_dump_json(indent=2))
```

This script combines both approaches:

1. Mistral OCR for document processing with high accuracy
2. Vision-capable LLM extraction for structured data from images

You can use it from the command line with various options:

```bash
# Use Mistral OCR to process a PDF and output markdown
python document_extraction.py document.pdf --method ocr --format markdown --output result.md

# Use vision LLM to extract structured data from an image using the receipt schema
python document_extraction.py receipt.jpg --method vision --schema receipt --output receipt_data.json
```

Or import the functions in your own code:

```python
from document_extraction import process_document_with_mistral_ocr, extract_structured_data_from_image, ReceiptData

# Process a document with Mistral OCR
ocr_result = process_document_with_mistral_ocr("document.pdf", output_format="markdown")

# Extract structured data from a receipt image
receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
```

The script provides flexibility to choose the appropriate method based on your needs and the type of document you're processing.
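Building on the import example above, here is a small follow-on sketch (assuming the same `document_extraction.py` module; the file names are placeholders) that uses the bundled utility functions to persist the extracted data and load it back into the same Pydantic model:

```python
from document_extraction import (
    extract_structured_data_from_image,
    save_structured_data,
    read_structured_data,
    ReceiptData,
)

# Extract, persist, and reload a receipt (file names are placeholders).
receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
save_structured_data(receipt_data, "receipt_data.json")

loaded = read_structured_data("receipt_data.json", ReceiptData)
print(loaded.store_name)
```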
CraftsMan-Labs revised this gist
Mar 7, 2025. 1 changed file with 38 additions and 197 deletions.
# Extracting Structured Data from Images Using Pydantic and LLMs

This guide demonstrates how to create a system that uses vision-capable LLMs to extract structured data from images using Pydantic for data validation.

## Core Components

```python
import litellm

# @@ -219,15 +124,17 @@ def extract_structured_data_from_image(
                structured_data = model_class.model_validate_json(json_str)
                return structured_data
            else:
                if attempt == max_retries:
                    return model_class()
        except Exception as e:
            print(f"Error in attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                return model_class()

    return model_class()
```

## Utility Functions

```python
def save_structured_data(data: BaseModel, output_path: str):

# @@ -243,11 +150,7 @@ def read_structured_data(file_path: str, model_class: Type[T]) -> T:
    return model_class.model_validate_json(json_data)
```

## Pydantic Models

```python
class Person(BaseModel):

# @@ -281,11 +184,7 @@ class ImageData(BaseModel):
    scene_description: SceneDescription = Field(default_factory=SceneDescription)
    text_in_image: Optional[List[str]] = None
    additional_notes: Optional[str] = None

class DocumentData(BaseModel):
    """Data model for extracting information from document images."""
    title: Optional[str] = None

# @@ -294,11 +193,7 @@ class DocumentData(BaseModel):
    content_summary: Optional[str] = None
    key_points: List[str] = Field(default_factory=list)
    entities_mentioned: List[str] = Field(default_factory=list)

class ProductData(BaseModel):
    """Data model for extracting product information from images."""
    product_name: Optional[str] = None

# @@ -310,25 +205,7 @@ class ProductData(BaseModel):
    estimated_price_range: Optional[str] = None
```

## Example Usage

```python
# Example usage

# @@ -348,69 +225,33 @@ if __name__ == "__main__":
    # Extract data with a different model
    document_data = extract_structured_data_from_image(image_path, DocumentData)
    print(document_data.model_dump_json(indent=2))

    # Custom prompt example
    custom_prompt = """
    Examine this image carefully and extract the following information:
    1. Identify all people present
    2. Describe their clothing in detail
    3. List all visible objects
    4. Note any text visible in the image
    Format the response as valid JSON according to the provided schema.
    """

    data = extract_structured_data_from_image(
        image_path,
        ImageData,
        custom_prompt=custom_prompt
    )

    # Custom model example
    class ReceiptData(BaseModel):
        """Model for extracting information from receipts."""
        store_name: Optional[str] = None
        date: Optional[str] = None
        total_amount: Optional[str] = None
        items: List[Dict[str, Any]] = Field(default_factory=list)
        payment_method: Optional[str] = None
        tax_amount: Optional[str] = None

    receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
```
CraftsMan-Labs revised this gist
Mar 7, 2025. 1 changed file with 83 additions and 0 deletions.
This comprehensive guide demonstrates how to create a robust system that uses Large Language Models (LLMs) with vision capabilities to extract structured data from images. By combining Pydantic's data validation with LLMs' image analysis capabilities, we can transform unstructured visual information into well-defined, type-safe Python objects.

To create Mermaid JS diagrams that work well in GitHub Gists, you need to use the proper syntax within a code block with the "mermaid" language identifier. GitHub has native support for Mermaid diagrams, allowing you to create various visualizations directly in your Gists.

## Basic Mermaid Setup for GitHub Gists

Simply wrap your Mermaid syntax in a code block with the `mermaid` language identifier:

```mermaid
graph TD;
    A-->B;
    A-->C;
    B-->D;
    C-->D;
```

## Common Diagram Types

### Flowchart

```mermaid
flowchart LR
    A[Start] --> B{Decision}
    B -- Yes --> C[Process 1]
    B -- No --> D[Process 2]
    C --> E[End]
    D --> E
```

### Sequence Diagram

```mermaid
sequenceDiagram
    participant Client
    participant Server
    Client->>Server: Request Data
    Server->>Client: Response
    Note over Client,Server: Simple HTTP Request
```

### Class Diagram

```mermaid
classDiagram
    Animal <|-- Duck
    Animal <|-- Fish
    Animal: +int age
    Animal: +String gender
    class Duck{
        +String beakColor
        +swim()
        +quack()
    }
```

### Pie Chart

```mermaid
pie title Distribution
    "Category A" : 42
    "Category B" : 28
    "Category C" : 30
```

### Git Graph

```mermaid
gitGraph
    commit id: "initial"
    branch develop
    checkout develop
    commit id: "feature"
    checkout main
    merge develop tag: "v1.0.0"
    commit id: "hotfix"
```

## Tips for GitHub Gists

1. Ensure each diagram has its own separate code block
2. Add semicolons at the end of each line for better compatibility
3. Keep diagrams simple, as complex ones may not render correctly
4. Use the Mermaid Live Editor to test your diagrams before adding them to Gists

## Understanding the Need for Structured Image Data

Visual information is inherently unstructured, making it challenging to process programmatically. Images contain rich information—people, objects, text, scenes—but extracting this data in a consistent, usable format requires sophisticated tools and techniques. The integration of Pydantic with vision-capable LLMs provides an elegant solution to this problem.
CraftsMan-Labs revised this gist
Mar 7, 2025. No changes.
-
CraftsMan-Labs revised this gist
Mar 7, 2025. No changes.
-
CraftsMan-Labs revised this gist
Mar 7, 2025. No changes.
-
CraftsMan-Labs created this gist
Mar 7, 2025.
# Extracting Structured Data from Images Using Pydantic and LLMs

This comprehensive guide demonstrates how to create a robust system that uses Large Language Models (LLMs) with vision capabilities to extract structured data from images. By combining Pydantic's data validation with LLMs' image analysis capabilities, we can transform unstructured visual information into well-defined, type-safe Python objects.

## Understanding the Need for Structured Image Data

Visual information is inherently unstructured, making it challenging to process programmatically. Images contain rich information—people, objects, text, scenes—but extracting this data in a consistent, usable format requires sophisticated tools and techniques. The integration of Pydantic with vision-capable LLMs provides an elegant solution to this problem.

Pydantic offers robust data validation and schema generation, while LLMs can analyze and interpret visual content. Together, they form a powerful pipeline for extracting structured data from images that can be seamlessly integrated into your applications.

## Core Components of the Solution

The solution consists of several key components that work together to transform image data into structured Pydantic objects:

### Base Image Data Extraction Function

This function serves as the core component that processes an image and returns structured data based on a Pydantic model:

```python
import litellm
import base64
import json
import re
from typing import List, Dict, Any, Optional, Union, Type, TypeVar
from pydantic import BaseModel, Field

T = TypeVar('T', bound=BaseModel)

def encode_image_to_base64(image_path):
    """Convert an image file to base64 encoding."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_json_from_text(text):
    """Extract JSON object from text that might contain additional content."""
    # Try to find JSON in markdown code blocks
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', text)
    if json_match:
        json_str = json_match.group(1)
    else:
        # Try to find JSON based on braces
        json_match = re.search(r'({[\s\S]*})', text)
        if json_match:
            json_str = json_match.group(1)
        else:
            json_str = text

    # Clean the string and try to parse it
    json_str = json_str.strip()
    try:
        json.loads(json_str)
        return json_str
    except json.JSONDecodeError:
        return None

def generate_extraction_prompt(model_class):
    """Generate a prompt based on a Pydantic model structure."""
    schema = model_class.model_json_schema()
    prompt = "Analyze this image and extract information in JSON format.\n\n"
    prompt += f"Return the data according to this JSON schema:\n```json\n{json.dumps(schema, indent=2)}\n```\n\n"
    prompt += "Important guidelines:\n"
    prompt += "1. Only return valid JSON that conforms to the schema\n"
    prompt += "2. If you're not sure about a field, use null instead of guessing\n"
    prompt += "3. Don't add any explanations outside the JSON structure\n"
    prompt += "4. Extract as much relevant information as possible\n"
    return prompt

def extract_structured_data_from_image(
    image_path: str,
    model_class: Type[T],
    model_name: str = "ollama/llava-phi3",
    api_base: str = "http://localhost:11434",
    custom_prompt: Optional[str] = None,
    max_retries: int = 2
) -> T:
    """
    Extract structured data from an image based on a Pydantic model.

    Args:
        image_path: Path to the image file
        model_class: Pydantic model class defining the structure to extract
        model_name: Name of the vision model to use
        api_base: API endpoint for the model
        custom_prompt: Optional custom prompt to use instead of the generated one
        max_retries: Maximum number of retry attempts if parsing fails

    Returns:
        An instance of the provided model_class with extracted data
    """
    # Get base64 string of the image
    try:
        base64_image = encode_image_to_base64(image_path)
    except Exception as e:
        print(f"Error encoding image: {e}")
        return model_class()

    # Generate or use the provided prompt
    prompt = custom_prompt if custom_prompt else generate_extraction_prompt(model_class)

    # Try extraction with retries
    for attempt in range(max_retries + 1):
        try:
            # Make the API call to the model
            response = litellm.completion(
                model=model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                api_base=api_base
            )

            # Extract the text response
            response_text = response.choices[0].message.content

            # Extract JSON from the response
            json_str = extract_json_from_text(response_text)

            if json_str:
                # Try to parse the extracted JSON
                structured_data = model_class.model_validate_json(json_str)
                return structured_data
            else:
                if attempt == max_retries:
                    return model_class()
        except Exception as e:
            print(f"Error in attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                return model_class()

    return model_class()
```

### Utility Functions for Data Handling

These functions help manage the extracted data:

```python
def save_structured_data(data: BaseModel, output_path: str):
    """Save structured data to a JSON file."""
    with open(output_path, "w") as f:
        f.write(data.model_dump_json(indent=2))
    print(f"Data saved to {output_path}")

def read_structured_data(file_path: str, model_class: Type[T]) -> T:
    """Read structured data from a JSON file into a Pydantic model."""
    with open(file_path, "r") as f:
        json_data = f.read()
    return model_class.model_validate_json(json_data)
```

## Defining Pydantic Models for Different Use Cases

The power of this approach lies in its flexibility.
By defining different Pydantic models, we can extract various types of structured data from images:

### General Image Analysis Model

```python
class Person(BaseModel):
    """Data model for a person detected in an image."""
    name: Optional[str] = None
    gender: Optional[str] = None
    approximate_age: Optional[str] = None
    clothing_description: Optional[str] = None
    position_in_image: Optional[str] = None

class Object(BaseModel):
    """Data model for an object detected in an image."""
    name: str
    color: Optional[str] = None
    size: Optional[str] = None
    position_in_image: Optional[str] = None
    quantity: Optional[int] = 1

class SceneDescription(BaseModel):
    """Data model for an overall scene description."""
    setting: Optional[str] = None
    time_of_day: Optional[str] = None
    weather: Optional[str] = None
    general_mood: Optional[str] = None
    key_activities: Optional[List[str]] = None

class ImageData(BaseModel):
    """Overall data model for information extracted from an image."""
    persons: List[Person] = Field(default_factory=list)
    objects: List[Object] = Field(default_factory=list)
    scene_description: SceneDescription = Field(default_factory=SceneDescription)
    text_in_image: Optional[List[str]] = None
    additional_notes: Optional[str] = None
```

### Document Analysis Model

```python
class DocumentData(BaseModel):
    """Data model for extracting information from document images."""
    title: Optional[str] = None
    date: Optional[str] = None
    document_type: Optional[str] = None
    content_summary: Optional[str] = None
    key_points: List[str] = Field(default_factory=list)
    entities_mentioned: List[str] = Field(default_factory=list)
```

### Product Information Model

```python
class ProductData(BaseModel):
    """Data model for extracting product information from images."""
    product_name: Optional[str] = None
    brand: Optional[str] = None
    category: Optional[str] = None
    color: Optional[str] = None
    features: List[str] = Field(default_factory=list)
    condition: Optional[str] = None
    estimated_price_range: Optional[str] = None
```

## Implementation Details and Technical Considerations

The implementation includes several important technical aspects that ensure reliability and robustness:

### Prompt Engineering

The system automatically generates effective prompts based on the Pydantic model's JSON schema. This ensures the LLM understands exactly what information to extract and in what format. The prompt generator creates detailed instructions that guide the model to produce properly structured outputs.

### Error Handling and Retries

The extraction function implements a retry mechanism that attempts to recover from parsing failures. If the LLM produces invalid or unparseable JSON, the system will retry with a more explicit prompt that emphasizes the need for valid JSON output.

### JSON Extraction from Messy Responses

LLMs sometimes include explanatory text or markdown formatting in their responses. The `extract_json_from_text` function uses regular expressions to find and extract valid JSON from potentially messy responses, making the system more robust.
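To make that behaviour concrete, here is a small illustrative sketch (the reply text and field values are invented for demonstration and are not from the guide): `extract_json_from_text` can pull a bare JSON object out of surrounding chatter, and the result can then be validated straight into a model.

```python
# Illustrative only: a messy model reply with a bare JSON object embedded in prose.
messy_reply = (
    "Sure! Based on the document image, here is what I found: "
    '{"title": "Quarterly Report", "document_type": "report", '
    '"key_points": ["Revenue grew", "Costs fell"]} '
    "Let me know if you need anything else."
)

json_str = extract_json_from_text(messy_reply)
if json_str is not None:
    doc = DocumentData.model_validate_json(json_str)
    print(doc.title)  # "Quarterly Report"
```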
## Using the System in Practice

Here's how to use the system to extract structured data from images:

```python
# Example usage
if __name__ == "__main__":
    # Define your image path
    image_path = "path/to/your/image.jpg"

    # Extract general image data
    image_data = extract_structured_data_from_image(image_path, ImageData)

    # Print the extracted data
    print(image_data.model_dump_json(indent=2))

    # Save the extracted data
    save_structured_data(image_data, "extracted_image_data.json")

    # Extract data with a different model
    document_data = extract_structured_data_from_image(image_path, DocumentData)
    print(document_data.model_dump_json(indent=2))
```

## Advanced Features and Customization

The system is designed to be flexible and customizable:

### Custom Prompts

While the default prompt generator works well for most cases, you can supply custom prompts to guide the extraction process more specifically:

```python
custom_prompt = """
Examine this image carefully and extract the following information:
1. Identify all people present
2. Describe their clothing in detail
3. List all visible objects
4. Note any text visible in the image
Format the response as valid JSON according to the provided schema.
"""

data = extract_structured_data_from_image(
    image_path,
    ImageData,
    custom_prompt=custom_prompt
)
```

### Custom Model Definitions

You can create custom Pydantic models to extract exactly the information you need:

```python
class ReceiptData(BaseModel):
    """Model for extracting information from receipts."""
    store_name: Optional[str] = None
    date: Optional[str] = None
    total_amount: Optional[str] = None
    items: List[Dict[str, Any]] = Field(default_factory=list)
    payment_method: Optional[str] = None
    tax_amount: Optional[str] = None

receipt_data = extract_structured_data_from_image("receipt.jpg", ReceiptData)
```

## Technical Implementation Details

The solution leverages several key technologies:

1. **Pydantic** for data validation and schema generation[1][5][7]
2. **LiteLLM** as a client for interacting with LLM APIs
3. **Regular Expressions** for extracting JSON from model responses
4. **Type Hints** to ensure code correctness and enable better IDE support

The implementation follows best practices for error handling, retries, and data validation. It uses Pydantic's JSON validation capabilities to ensure the extracted data conforms to the expected structure[4][6].

## Conclusion

The integration of Pydantic with vision-capable LLMs provides a powerful framework for extracting structured data from images. This approach offers several advantages:

1. **Type Safety**: The extracted data is guaranteed to match the defined structure.
2. **Flexibility**: Custom models can be defined for different extraction needs.
3. **Robustness**: Error handling and retry mechanisms ensure reliable operation.
4. **Ease of Use**: The extracted data is immediately available as Python objects.

By following the approach outlined in this guide, you can transform unstructured visual information into structured, validated data that can be easily integrated into your applications and workflows. This enables numerous applications, from document processing to scene analysis, product recognition, and beyond.