Spaces:
Sleeping
Sleeping
| from functools import lru_cache | |
| import time | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline | |
# Mapping from human-readable dropdown labels to Hugging Face model ids.
MODEL_OPTIONS = {
    "SmolLM2 360M Instruct (best default)": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2 135M Instruct (fast)": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "distilgpt2 (baseline)": "distilgpt2",
}
# Label pre-selected in every model dropdown.
DEFAULT_MODEL = "SmolLM2 360M Instruct (best default)"
# Labels of instruction-tuned models: build_prompt gives these the detailed
# instruction template and falls back to a bare completion prompt otherwise.
INSTRUCT_MODEL_LABELS = {
    "SmolLM2 360M Instruct (best default)",
    "SmolLM2 135M Instruct (fast)",
}
# Per-viewpoint guidance injected into the instruction prompt. Keys double as
# the "Viewpoint" dropdown choices in the UI.
VIEWPOINT_GUIDES = {
    "close-up": (
        "Focus on nearby detail, texture, facial expression, small objects, and "
        "what is cropped out or hidden by the tight framing."
    ),
    "wide shot": (
        "Focus on layout, background, scale, distance between objects, and how "
        "the whole scene is arranged."
    ),
    "bird's-eye view": (
        "Describe the scene from above. Focus on map-like layout, paths, shapes, "
        "and what becomes visible only from overhead."
    ),
    "low angle": (
        "Describe the scene from below. Focus on height, scale, foreground, "
        "dominance, sky or ceiling, and what is hidden behind tall objects."
    ),
    "over-the-shoulder": (
        "Describe what is visible from behind one character or object. Focus on "
        "foreground shoulder/frame, partial visibility, and what the viewer can "
        "infer but not fully see."
    ),
}
# Per-output-mode guidance injected into the instruction prompt. Keys double as
# the "Output mode" dropdown choices in the UI.
MODE_GUIDES = {
    "cinematic shot description": (
        "Write like a film shot description, emphasizing framing, movement, and "
        "what the viewer sees first."
    ),
    "photography caption": (
        "Write like a precise photography caption, emphasizing composition and "
        "visible details."
    ),
    "storyboard note": (
        "Write like a storyboard note for an artist, naming visual beats and "
        "spatial relationships."
    ),
    "image prompt helper": (
        "Write a detailed image-generation prompt that makes the viewpoint and "
        "composition explicit."
    ),
    "visual analysis paragraph": (
        "Write an analytical paragraph explaining how the viewpoint changes "
        "what is visible and what is hidden."
    ),
}
# Viewpoints exercised by the "Five-Viewpoint Test" tab, in the order the
# output sections are rendered. Each entry must be a key of VIEWPOINT_GUIDES.
FIVE_VIEWPOINTS = [
    "close-up",
    "wide shot",
    "bird's-eye view",
    "low angle",
    "over-the-shoulder",
]
# Cap torch's intra-op threads to keep CPU usage modest on shared hardware.
# Best-effort: set_num_threads can raise in some runtimes, and the app works
# without the cap, so failures are deliberately ignored.
try:
    torch.set_num_threads(2)
except Exception:
    pass
@lru_cache(maxsize=1)
def load_generator(model_label):
    """Build (and memoize) a CPU text-generation pipeline for a model label.

    Args:
        model_label: A key of MODEL_OPTIONS identifying the model to load.

    Returns:
        A transformers text-generation pipeline running on CPU (device=-1).

    Raises:
        KeyError: If model_label is not a key of MODEL_OPTIONS.

    The result is cached with functools.lru_cache (the import was previously
    unused); without the cache every generation request reloaded the model
    from disk. maxsize=1 keeps only the most recently used model resident so
    memory stays bounded on small CPU instances.
    """
    model_id = MODEL_OPTIONS[model_label]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # GPT-2-family tokenizers ship without a pad token; reuse EOS so the
    # pipeline can pad without erroring.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
    model.eval()
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=-1,  # force CPU
    )
def build_prompt(model_label, scene, viewpoint, output_mode):
    """Assemble the text prompt sent to the selected model.

    Instruction-tuned models (those in INSTRUCT_MODEL_LABELS) receive a
    detailed instruction template; plain language models receive a short
    completion-style prompt instead.

    Args:
        model_label: Key of MODEL_OPTIONS naming the target model.
        scene: User-entered scene description (leading/trailing space ignored).
        viewpoint: Key of VIEWPOINT_GUIDES.
        output_mode: Key of MODE_GUIDES.

    Returns:
        The prompt string to feed to the text-generation pipeline.
    """
    cleaned_scene = scene.strip()
    view_hint = VIEWPOINT_GUIDES[viewpoint]
    mode_hint = MODE_GUIDES[output_mode]
    # Base models get a minimal completion prompt; anything richer tends to
    # be echoed rather than followed.
    if model_label not in INSTRUCT_MODEL_LABELS:
        return f"{viewpoint.title()} {output_mode}.\nScene: {cleaned_scene}\nDescription:"
    header = (
        "You are a careful visual scene description assistant for a student "
        "research project.\n"
        "Describe the same scene from a selected viewpoint. The important question "
        "is not just camera vocabulary; explain what becomes visible, hidden, "
        "larger, smaller, foregrounded, or backgrounded because of the viewpoint.\n\n"
    )
    details = (
        f"Viewpoint: {viewpoint}\n"
        f"Viewpoint guidance: {view_hint}\n"
        f"Output mode: {output_mode}\n"
        f"Output guidance: {mode_hint}\n"
        f"Scene: {cleaned_scene}\n\n"
        "Write the response now:"
    )
    return header + details
def call_model(model_label, final_prompt, temperature, top_p, max_new_tokens):
    """Run one sampled generation and return the stripped completion text.

    Args:
        model_label: Key of MODEL_OPTIONS selecting the model.
        final_prompt: Fully assembled prompt string.
        temperature: Sampling temperature (floored at 0.05 to avoid 0).
        top_p: Nucleus-sampling cutoff.
        max_new_tokens: Generation length budget (coerced to int).

    Returns:
        The generated continuation with surrounding whitespace removed, or a
        placeholder message when the model produced only whitespace.
    """
    generator = load_generator(model_label)
    tok = generator.tokenizer
    outputs = generator(
        final_prompt,
        max_new_tokens=int(max_new_tokens),
        temperature=max(float(temperature), 0.05),  # guard against temperature 0
        top_p=float(top_p),
        do_sample=True,
        repetition_penalty=1.08,
        return_full_text=False,  # only the continuation, not the prompt
        pad_token_id=tok.eos_token_id,
        eos_token_id=tok.eos_token_id,
    )
    completion = outputs[0]["generated_text"].strip()
    if completion:
        return completion
    return "(The model returned an empty response. Try more tokens.)"
def generate_viewpoint(
    model_label,
    scene,
    viewpoint,
    output_mode,
    temperature,
    top_p,
    max_new_tokens,
):
    """Gradio handler for the single-viewpoint tab.

    Returns a 3-tuple of (generated text, prompt sent to the model, run note),
    matching the tab's three output textboxes. Validation failures and model
    errors are reported through the same tuple rather than raised.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", "", ""
    final_prompt = build_prompt(model_label, scene, viewpoint, output_mode)
    start = time.perf_counter()
    try:
        generated = call_model(
            model_label,
            final_prompt,
            temperature,
            top_p,
            max_new_tokens,
        )
    except Exception as exc:
        # Surface the failure in the UI instead of crashing the handler.
        return (
            f"Error while running the model: {exc}",
            final_prompt,
            "Try the fast model first, or reduce max tokens.",
        )
    seconds = time.perf_counter() - start
    run_note = (
        f"Model: {MODEL_OPTIONS[model_label]}\n"
        f"Elapsed: {seconds:.1f} seconds\n"
        "First use can be slower because the model has to download and load."
    )
    return generated, final_prompt, run_note
def make_paper_notes(scene, outputs_text):
    """Build the fixed reading checklist for the paper's findings section.

    Args:
        scene: Scene description; blank/None falls back to "the tested scene".
        outputs_text: Accepted for interface compatibility; the checklist text
            does not currently depend on it.

    Returns:
        A multi-line checklist string headed by the scene label.
    """
    label = "the tested scene"
    if scene and scene.strip():
        label = scene.strip()
    checklist = (
        "Use these checks while reading the outputs:\n\n"
        "1. Visibility: Which objects become visible or hidden in each viewpoint?\n"
        "2. Occlusion: Does the model notice when one object blocks another?\n"
        "3. Scale: Does low angle or close-up change perceived size or importance?\n"
        "4. Layout: Does bird's-eye or wide shot explain spatial relationships?\n"
        "5. Specificity: Does the model describe this scene, or could the paragraph "
        "fit almost any scene?\n"
        "6. Finding sentence: Write one cautious sentence about whether the model "
        "understands viewpoint consequences or only uses camera-angle words.\n\n"
        "Useful wording for the paper:\n"
        "In this small test, the model was strongest when ____. It was weakest "
        "when ____. The clearest limitation was ____."
    )
    return f"Paper notes for: {label}\n\n" + checklist
def run_five_viewpoints(model_label, scene, output_mode, temperature, top_p, max_new_tokens):
    """Gradio handler for the five-viewpoint tab.

    Generates one output per entry of FIVE_VIEWPOINTS for the shared scene
    and returns a 2-tuple of (markdown sections, paper notes). Validation
    failures and model errors are reported through the same tuple.
    """
    if not (scene and scene.strip()):
        return "Please enter a scene.", ""
    start = time.perf_counter()
    sections = []
    try:
        for view in FIVE_VIEWPOINTS:
            prompt = build_prompt(model_label, scene, view, output_mode)
            text = call_model(
                model_label,
                prompt,
                temperature,
                top_p,
                max_new_tokens,
            )
            sections.append(f"## {view.title()}\n\n{text}")
    except Exception as exc:
        # Abort the remaining viewpoints and show the failure in the UI.
        return (
            f"Error while running the five-viewpoint test: {exc}",
            "Try the fast model first, or reduce max tokens.",
        )
    seconds = time.perf_counter() - start
    combined = "\n\n---\n\n".join(sections)
    notes = make_paper_notes(scene, combined) + f"\n\nElapsed: {seconds:.1f} seconds."
    return combined, notes
def notes_from_pasted_outputs(scene, pasted_outputs):
    """Gradio handler for the paper-notes tab.

    Returns the reading checklist for `scene`, or a reminder string when
    nothing has been pasted yet.
    """
    if pasted_outputs and pasted_outputs.strip():
        return make_paper_notes(scene, pasted_outputs)
    return "Paste your generated outputs first."
# UI definition. Three tabs share the same model/mode vocabularies defined
# above; each tab wires its widgets to one of the handler functions.
with gr.Blocks(title="Camera Angle Model Lab", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Camera Angle Model Lab\n"
        "CPU-only viewpoint lab for testing how small language models describe "
        "the same scene from different visual perspectives. No API tokens or paid "
        "compute required. The first run may take a minute while the model loads."
    )
    # --- Tab 1: generate one viewpoint at a time -------------------------
    with gr.Tab("Single Viewpoint Writer"):
        with gr.Row():
            model_one = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
            )
            viewpoint_one = gr.Dropdown(
                choices=list(VIEWPOINT_GUIDES.keys()),
                value="close-up",
                label="Viewpoint",
            )
            mode_one = gr.Dropdown(
                choices=list(MODE_GUIDES.keys()),
                value="visual analysis paragraph",
                label="Output mode",
            )
        scene_one = gr.Textbox(
            label="Scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        with gr.Row():
            temperature_one = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
            top_p_one = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_one = gr.Slider(40, 170, value=100, step=10, label="Max new tokens")
        run_one = gr.Button("Generate", variant="primary")
        output_one = gr.Textbox(label="Generated output", lines=10)
        prompt_sent_one = gr.Textbox(label="Prompt sent to model", lines=8)
        note_one = gr.Textbox(label="Run note", lines=3)
        # generate_viewpoint returns (output, prompt, note) matching the
        # three textboxes above.
        run_one.click(
            fn=generate_viewpoint,
            inputs=[
                model_one,
                scene_one,
                viewpoint_one,
                mode_one,
                temperature_one,
                top_p_one,
                max_tokens_one,
            ],
            outputs=[output_one, prompt_sent_one, note_one],
        )
        # Clickable starter examples: (scene, viewpoint, output mode).
        gr.Examples(
            examples=[
                ["A dog hides under a kitchen table while a child looks for it.", "close-up", "visual analysis paragraph"],
                ["A crowded city street after rain reflects neon signs in puddles.", "bird's-eye view", "cinematic shot description"],
                ["A soccer player prepares to take a penalty kick while the goalkeeper waits.", "low angle", "storyboard note"],
                ["A person stands at the edge of a forest path holding a lantern.", "over-the-shoulder", "image prompt helper"],
                ["A museum gallery contains one bright painting at the far end of the room.", "wide shot", "photography caption"],
            ],
            inputs=[scene_one, viewpoint_one, mode_one],
        )
    # --- Tab 2: run all five viewpoints on one shared scene --------------
    with gr.Tab("Five-Viewpoint Test"):
        model_grid = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value=DEFAULT_MODEL,
            label="Model",
        )
        scene_grid = gr.Textbox(
            label="Shared scene",
            lines=4,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        mode_grid = gr.Dropdown(
            choices=list(MODE_GUIDES.keys()),
            value="visual analysis paragraph",
            label="Output mode",
        )
        with gr.Row():
            temperature_grid = gr.Slider(0.1, 1.5, value=0.6, step=0.1, label="Temperature")
            top_p_grid = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens_grid = gr.Slider(40, 140, value=80, step=10, label="Max new tokens")
        run_grid = gr.Button("Run Five Viewpoints", variant="primary")
        grid_output = gr.Markdown(label="Five-viewpoint output")
        grid_notes = gr.Textbox(label="Paper notes", lines=14)
        run_grid.click(
            fn=run_five_viewpoints,
            inputs=[
                model_grid,
                scene_grid,
                mode_grid,
                temperature_grid,
                top_p_grid,
                max_tokens_grid,
            ],
            outputs=[grid_output, grid_notes],
        )
    # --- Tab 3: checklist generator for pasted outputs -------------------
    with gr.Tab("Paper Notes Helper"):
        scene_notes = gr.Textbox(
            label="Scene being tested",
            lines=3,
            value="A dog hides under a kitchen table while a child looks for it.",
        )
        pasted_outputs = gr.Textbox(
            label="Paste generated outputs here",
            lines=12,
            placeholder="Paste close-up, wide shot, bird's-eye, low angle, and over-the-shoulder outputs here.",
        )
        run_notes = gr.Button("Make Paper Notes", variant="primary")
        paper_notes = gr.Textbox(label="Checklist for findings section", lines=14)
        run_notes.click(
            fn=notes_from_pasted_outputs,
            inputs=[scene_notes, pasted_outputs],
            outputs=paper_notes,
        )
    gr.Markdown(
        "### Duplication note\n"
        "This Space uses only local CPU models. No tokens, API keys, or paid "
        "hardware are required. Students can duplicate it and edit the viewpoints, "
        "output modes, examples, or model list."
    )
# Standard script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()