import torch from transformers import pipeline import numpy as np import gradio as gr from PIL import Image, ImageDraw import scipy.io.wavfile as wavfile import tempfile # 1. Loading Models # Use 'cuda' if you have a GPU, otherwise 'cpu' device = "cuda" if torch.cuda.is_available() else "cpu" narrator = pipeline( 'text-to-speech', # Note: 'text-to-audio' works, but 'text-to-speech' is standard model="facebook/mms-tts-eng", device=device ) the_detector = pipeline( "object-detection", model="facebook/detr-resnet-50", device=device ) # 2. The Boundary Maker def draw_bounding_boxes(image, detections): draw_image = image.copy() draw = ImageDraw.Draw(draw_image) for detection in detections: box = detection['box'] xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax'] # Draw elliptical boundary draw.ellipse([(xmin, ymin), (xmax, ymax)], outline='red', width=3) label = detection['label'] score = detection['score'] text = f"{label} {score:.2f}" # Draw a small background for the text draw.rectangle([(xmin, ymin - 15), (xmin + 80, ymin)], fill='red') draw.text((xmin + 2, ymin - 15), text, fill='white') return draw_image # 3. The Audio Artist (Fixed for Gradio Compatibility) def generate_audio(text): audio_data = narrator(text) waveform = audio_data['audio'] sampling_rate = audio_data['sampling_rate'] # Standardize waveform shape waveform = np.squeeze(waveform) # Save to a temporary file for Gradio to read temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav") wavfile.write(temp_file.name, sampling_rate, waveform) return temp_file.name # 4. Generate Natural Text def read_objects(detection_objects): if not detection_objects: return "I couldn't find any objects in this picture." object_counts = {} for detection in detection_objects: label = detection['label'] object_counts[label] = object_counts.get(label, 0) + 1 response = 'This picture contains' labels = list(object_counts.keys()) for i, label in enumerate(labels): count = object_counts[label] plural_label = f"{label}s" if count > 1 else label if i == len(labels) - 1 and len(labels) > 1: response += f" and {count} {plural_label}." else: response += f" {count} {plural_label}" + ("," if len(labels) > 2 and i < len(labels)-2 else "") if len(labels) == 1: response += "." return response # 5. The Collaborator def collaborator(img): output = the_detector(img) gen_image = draw_bounding_boxes(image=img, detections=output) natural_text = read_objects(output) audio_path = generate_audio(natural_text) return gen_image, audio_path # 6. UI Interface demo = gr.Interface( fn=collaborator, inputs=gr.Image(label='Upload Image', type='pil'), outputs=[ gr.Image(label='Detected Objects', type='pil'), gr.Audio(label='Description Audio') ], title='VisionTalk: Object Detector with Audio', description='Upload an image to see what objects are inside and hear a generated description.' ) if __name__ == "__main__": demo.launch()