#!/usr/bin/env python3
"""
extract-financial-disclosure.py

Parses and extracts structured data from the screenshot at the given URL:
https://gist.github.com/user-attachments/assets/9c35e7a4-e6b7-4d5b-a4a2-a62b6ec28504

Full financial disclosure report:
https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2023/10059734.pdf

This script assumes your API key is set up in the default way,
i.e. environment variable: $OPENAI_API_KEY
https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety
"""

import base64
import json
from pathlib import Path
from typing import Union

from openai import OpenAI
from pydantic import BaseModel, Field

# Image that is sent to the model for extraction.
INPUT_URL = "https://gist.github.com/user-attachments/assets/9c35e7a4-e6b7-4d5b-a4a2-a62b6ec28504"

# OpenAI examples of Structured Output scripts and data definitions
# https://platform.openai.com/docs/guides/structured-outputs/examples?context=ex2

# Define the data structures in Pydantic:
# a Disclosure Report has a list of assets
#
# NOTE: the models are documented with `#` comments rather than docstrings on
# purpose. Pydantic copies a class docstring into the generated JSON schema as
# its "description", which would change the schema sent with the request.


# One asset entry of the disclosure report.
# NOTE(review): field meanings below are inferred from the field names only --
# confirm against the columns of the source form.
class Asset(BaseModel):
    asset_name: str
    owner: str
    location: Union[str, None]
    # presumably the lower/upper bound of the reported value range; nullable
    asset_value_low: Union[int, None]
    asset_value_high: Union[int, None]
    income_type: str
    # presumably the lower/upper bound of the reported income range; nullable
    income_low: Union[int, None]
    income_high: Union[int, None]
    # presumably the "transaction greater than $1,000" flag -- TODO confirm
    tx_gt_1000: bool


# Top-level response schema: the full list of extracted assets.
class DisclosureReport(BaseModel):
    assets: list[Asset]


## initialize OpenAI client
client = OpenAI()

# Example of message format for passing in an image via URL
# https://cookbook.openai.com/examples/gpt4o/introduction_to_gpt4o#url-image-processing
input_messages = [
    {"role": "system", "content": "Output the result in JSON format."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Extract the text from this image"},
            {
                "type": "image_url",
                "image_url": {"url": INPUT_URL},
            },
        ],
    },
]

# gpt-4o-mini is cheap and fast and has vision capabilities
response = client.beta.chat.completions.parse(
    response_format=DisclosureReport,
    model="gpt-4o-mini",
    messages=input_messages,
)

message = response.choices[0].message

# A refusal carries no JSON content; without this check json.loads(None)
# would fail with an unhelpful TypeError.
if message.refusal:
    raise SystemExit(f"Model refused the request: {message.refusal}")
if message.content is None:
    raise SystemExit("Model returned no content to parse.")

# Print it out in readable format
obj = json.loads(message.content)
print(json.dumps(obj, indent=2))