# Vision tool using Groq's chat completions API (Llama-4 Scout swapped for Qwen QwQ)
from smolagents import tool
from groq import Groq
import os
def _llama_analyze(image_b64: str, prompt: str) -> str:
    """Send an image and a text prompt to Groq's chat-completions endpoint.

    Args:
        image_b64: Base64-encoded image data (without the ``data:`` URL prefix).
        prompt: Text instruction describing what to analyze in the image.

    Returns:
        The model's text response.

    Raises:
        Exception: Any Groq client/API error propagates to the caller; the
            caller (``image_reasoning_tool``) is responsible for handling it.
    """
    # Explicitly read GROQ_API_KEY so the env-var dependency is visible here.
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                # Groq expects images as data URLs. The PNG mime type is
                # assumed — TODO confirm callers never pass JPEG data.
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
            ],
        }
    ]
    response = client.chat.completions.create(
        # NOTE(review): despite the function's name, the active model is Qwen
        # QwQ; meta-llama/llama-4-scout-17b-16e-instruct was swapped out.
        model="qwen/qwen-qwq-32b",
        messages=messages,
        stream=False,
        max_completion_tokens=512,
    )
    return response.choices[0].message.content
@tool
def image_reasoning_tool(image_file: str, prompt: str | None = None) -> dict:
    """Run OCR on an image and, when a prompt is given, vision-model reasoning.

    A single entry point combining pytesseract OCR with Llama vision analysis
    so the planner only sees one image tool.

    Args:
        image_file: Path to the image file to analyze.
        prompt: Optional instruction for the vision model. If omitted, only OCR
            is performed.

    Returns:
        Dictionary with OCR text, base64 image data and optional vision model
        response.
    """
    try:
        # Imported lazily so the tool degrades gracefully (error dict below)
        # when optional dependencies are missing.
        import pytesseract
        from PIL import Image
        from smolagents.utils import encode_image_base64

        img = Image.open(image_file)
        encoded = encode_image_base64(img)
        extracted = pytesseract.image_to_string(img)

        analysis = ""
        if prompt:
            try:
                analysis = _llama_analyze(encoded, prompt)
            except Exception as exc:  # a vision failure must not lose the OCR result
                analysis = f"Error processing image with vision model: {exc}"

        return {
            "ocr_text": extracted,
            "vision_text": analysis,
            "base64_image": encoded,
        }
    except Exception as exc:
        # Any failure (unreadable file, missing deps) is reported in-band so
        # the calling agent can react instead of crashing.
        return {
            "ocr_text": "",
            "vision_text": "",
            "base64_image": "",
            "error": str(exc),
        }