import torch
import torch.nn as nn
from PIL import Image
from diffusers import DiffusionPipeline
from transformers import CLIPProcessor, CLIPModel, AutoImageProcessor, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load SD 1.5 with the community long-prompt-weighting pipeline so prompts
# longer than CLIP's 77-token limit are not truncated:
# https://github.com/huggingface/diffusers/tree/main/examples/community#long-prompt-weighting-stable-diffusion
# float16 is only safe on GPU; fall back to float32 on CPU.
diffusion_model = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="lpw_stable_diffusion",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)


def generate_image(prompt, image_filename="output.png"):
    # max_embeddings_multiples=3 lets the LPW pipeline encode up to 3 x 77 tokens.
    image = diffusion_model(
        prompt=prompt, width=512, height=512, max_embeddings_multiples=3
    ).images[0]
    image.save(image_filename)
    return image


def generate_image_embedding_with_clip(image):
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    with torch.no_grad():
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        image_features = clip_model.get_image_features(**inputs)
    return image_features[0]


def generate_image_embedding_with_dino(image):
    dino_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
    dino_model = AutoModel.from_pretrained("facebook/dinov2-base").to(device)
    with torch.no_grad():
        inputs = dino_processor(images=image, return_tensors="pt").to(device)
        outputs = dino_model(**inputs)
        # Mean-pool the patch tokens into a single image-level embedding.
        image_features = outputs.last_hidden_state.mean(dim=1)
    return image_features[0]


def compare_captions(image, short_caption, long_caption):
    # Render each caption back into an image, then score how close each
    # rendering is to the original image in DINOv2 and CLIP embedding space.
    short_caption_image = generate_image(short_caption, "short_caption_image.jpg")
    long_caption_image = generate_image(long_caption, "long_caption_image.jpg")

    image_embedding_dino = generate_image_embedding_with_dino(image)
    short_caption_image_embedding_dino = generate_image_embedding_with_dino(short_caption_image)
    long_caption_image_embedding_dino = generate_image_embedding_with_dino(long_caption_image)
    short_score_dino = calc_cosine_similarity(image_embedding_dino, short_caption_image_embedding_dino)
    long_score_dino = calc_cosine_similarity(image_embedding_dino, long_caption_image_embedding_dino)
    print(short_score_dino, long_score_dino)

    image_embedding_clip = generate_image_embedding_with_clip(image)
    short_caption_image_embedding_clip = generate_image_embedding_with_clip(short_caption_image)
    long_caption_image_embedding_clip = generate_image_embedding_with_clip(long_caption_image)
    short_score_clip = calc_cosine_similarity(image_embedding_clip, short_caption_image_embedding_clip)
    long_score_clip = calc_cosine_similarity(image_embedding_clip, long_caption_image_embedding_clip)
    print(short_score_clip, long_score_clip)


def calc_cosine_similarity(embedding_1, embedding_2):
    cos = nn.CosineSimilarity(dim=0)
    sim = cos(embedding_1, embedding_2).item()
    # Rescale cosine similarity from [-1, 1] to [0, 1].
    return (sim + 1) / 2


if __name__ == "__main__":
    image = Image.open("image.jpg")
    short_caption = "how to build an industry for dollars"
    long_caption = "In the image, there is a small black house with a green roof situated in a grassy area surrounded by trees. The house appears to be under construction or renovation, as there are various tools and materials visible around it, such as a hammer, nails, screws, and wood planks. The presence of these objects indicates that the house is being built or repaired, and the green roof adds a unique and eco-friendly feature to the structure."
    compare_captions(image, short_caption, long_caption)
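

# The captions above are hard-coded, but the original script also imported
# ollama, suggesting they were generated by a local vision-language model.
# A minimal sketch of how that could look, assuming an Ollama server with a
# vision-capable model pulled locally (the model name "llava" and the prompt
# below are illustrative assumptions, not part of the original script):
#
#     import ollama
#
#     def generate_caption(image_path, prompt="Describe this image in detail."):
#         # ollama.chat accepts image file paths via the "images" field
#         # of a message; the reply text is under response["message"]["content"].
#         response = ollama.chat(
#             model="llava",  # assumed model; any vision model served by Ollama works
#             messages=[{"role": "user", "content": prompt, "images": [image_path]}],
#         )
#         return response["message"]["content"]
#
#     long_caption = generate_caption("image.jpg")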