import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig

# =========================
# Model Setup & Patch
# =========================
model_id = 'microsoft/Florence-2-large'
device = "cuda" if torch.cuda.is_available() else "cpu"

# Florence-2 selects its behaviour through a task token passed as the text
# prompt. "<OCR>" yields the recognised text as one plain string, which is
# what the output Textbox expects ("<OCR_WITH_REGION>" would instead yield
# boxes and labels).
OCR_TASK = "<OCR>"

# PATCH: Explicitly handle the Florence2 configuration bug.
# The remote config may lack `forced_bos_token_id`; define it as None so any
# later attribute access finds it.
# NOTE(review): only needed for affected transformers versions - confirm
# before removing.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
if not hasattr(config, 'forced_bos_token_id'):
    config.forced_bos_token_id = None

# Load model and processor once at import time so every request reuses them.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True
).to(device).eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


def run_ocr(image):
    """Extract the text contained in *image* using Florence-2.

    Args:
        image: A PIL image as delivered by ``gr.Image(type="pil")``, or
            ``None`` when the user has not uploaded anything.

    Returns:
        The recognised text as a string, or a warning message when no image
        was supplied.
    """
    if image is None:
        return "⚠️ Please upload an image."

    # Uploads can arrive as RGBA / L / P (e.g. PNG scans); normalise to the
    # three-channel input the processor is fed with.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # The task token is the whole prompt. An empty prompt is not a known task,
    # so post-processing could not produce an entry to return.
    prompt = OCR_TASK

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            do_sample=False,
            num_beams=3
        )

    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    # Clean up the output; the result is a dict keyed by the task token.
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer[prompt]


# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    gr.Markdown("## 🖋️ Handwritten Note to Text (Florence-2)")
    with gr.Row():
        input_img = gr.Image(type="pil")
        output_text = gr.Textbox(label="Extracted Text", lines=10)
    btn = gr.Button("Convert to Text", variant="primary")
    btn.click(fn=run_ocr, inputs=input_img, outputs=output_text)

if __name__ == "__main__":
    demo.launch()