# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "torch",
#     "transformers",
#     "Pillow",
# ]
# ///
"""Caption every jpg/png image in the current directory with several models.

For each image, runs each captioning model in MODELS and prints the
generated caption (or the error, if that model fails on that image).
"""

import functools
from pathlib import Path

import torch
from transformers import pipeline

# List of models to test
MODELS = [
    "ydshieh/vit-gpt2-coco-en",
    "Salesforce/blip-image-captioning-large",
    "microsoft/git-base-coco",
]


@functools.cache
def _get_captioner(model_name: str):
    """Build (once per process) and cache the captioning pipeline for a model.

    Loading a transformers pipeline downloads/initializes the full model, so
    constructing it per-image — as the original code did — repeats that cost
    for every image. Caching makes the load happen once per model.
    """
    return pipeline(
        model=model_name,
        device=0 if torch.cuda.is_available() else -1,
    )


def process_image(image_path: Path, models) -> None:
    """Process a single image through multiple models.

    Args:
        image_path: Path to the image file to caption.
        models: Iterable of model identifiers understood by
            ``transformers.pipeline``.

    Prints one line per model: either the caption or the error message.
    Never raises — each model's failure is caught and reported.
    """
    print(f"\n{image_path.name}")

    for model_name in models:
        try:
            captioner = _get_captioner(model_name)

            # Generate caption; image-to-text pipelines typically return a
            # list of dicts like [{"generated_text": ...}].
            caption = captioner(str(image_path))

            # Get caption text (fall back to the raw result if the pipeline
            # returned something other than a list).
            caption_text = caption[0]['generated_text'] if isinstance(caption, list) else caption

            print(f"{model_name}: {caption_text}")

        except Exception as e:
            # Best-effort per model: report and continue with the next one.
            print(f"{model_name}: Error: {str(e)}")


def main() -> None:
    """Process all images in current directory."""
    # Get all image files in current directory; sorted() gives a
    # deterministic processing order across runs/filesystems.
    image_files = sorted(Path().glob('*.jpg')) + sorted(Path().glob('*.png'))

    if not image_files:
        print("No image files (jpg/png) found in current directory!")
        return

    # Process each image
    for img_path in image_files:
        process_image(img_path, MODELS)


if __name__ == '__main__':
    main()