import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the pretrained CLIP model and its processor once at startup.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def generate_caption(image):
    if image is None:
        return "No image uploaded."

    # Candidate text prompts to score against the uploaded image
    texts = [
        "a photo of a cat",
        "a photo of a dog",
        "a photo of a man",
        "a photo of a woman",
        "a photo of a laptop",
        "a photo of a smartphone",
        "a photo of a city",
        "a photo of a landscape",
        "a photo of food",
        "a photo of a car",
    ]

    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # convert scores to probabilities
    best_match = torch.argmax(probs).item()
    caption = texts[best_match]
    return f"Best match: {caption} (Confidence: {probs[0][best_match].item():.2f})"


iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning with CLIP",
    description=(
        "Upload an image and get a dynamically generated caption using CLIP."
        "<br><br><b>Detectable categories:</b><br>"
        "<div style='display: flex; gap: 40px;'>"
        "<div><ul><li>a photo of a cat</li><li>a photo of a dog</li>"
        "<li>a photo of a man</li><li>a photo of a woman</li>"
        "<li>a photo of a laptop</li></ul></div>"
        "<div><ul><li>a photo of a smartphone</li><li>a photo of a city</li>"
        "<li>a photo of a landscape</li><li>a photo of food</li>"
        "<li>a photo of a car</li></ul></div></div>"
    ),
)

iface.launch()
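
For a quick sanity check of the captioning logic without launching the Gradio UI, generate_caption() can be called directly on a local image. This is a minimal sketch, not part of the original Space; "example.jpg" is a placeholder path.

# Hypothetical local test: "example.jpg" is a placeholder filename
test_image = Image.open("example.jpg")
print(generate_caption(test_image))  # e.g. "Best match: a photo of a cat (Confidence: 0.94)"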