Spaces:
Sleeping
Sleeping
File size: 5,835 Bytes
51abc3c 2253aff 51abc3c 2253aff 51abc3c 7118cc4 51abc3c 391ac5c 56e22f3 391ac5c 2253aff b4c715a 2253aff 7118cc4 51abc3c 623cd35 51abc3c e6f58ab 51abc3c e6f58ab b4c715a 51abc3c 2253aff 51abc3c 551ab80 51abc3c 9ea452b 51abc3c e6f58ab 51abc3c a8a0099 51abc3c a8a0099 51abc3c a8a0099 51abc3c a8a0099 51abc3c c9fdc9d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | """High level multi-agent system powered by OpenRouter models.
This module sets up a manager agent that delegates tasks to specialized
web and information agents. It relies on the ``smolagents`` framework and
OpenRouter API models for language generation and verification.
"""
from smolagents import (
CodeAgent,
VisitWebpageTool,
WebSearchTool,
WikipediaSearchTool,
PythonInterpreterTool,
FinalAnswerTool,
OpenAIServerModel,
Tool,
)
from smolagents.utils import encode_image_base64, make_image_url
#from vision_tool import image_reasoning_tool
import os
# Hugging Face API token taken from the environment; None when the variable is unset.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
#audio_transcribe_tool = Tool.from_space(
# space_id = "hf-audio/whisper-large-v3",
# name = "audio_to_text",
# description = "Transcribe long-form YouTube videos or audio inputs. Paste the URL to a YouTube video or upload audio file to get the transcript.",
#)
#object_detection_tool = Tool.from_space(
# space_id = "stevengrove/YOLO-World",
# name = "Real-Time Open-Vocabulary Object Detector",
# description = "Detect objects in images or videos."
#)
# The OpenRouter key is mandatory: abort at import time when it is missing or empty.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise EnvironmentError("OPENROUTER_API_KEY environment variable not set")

# Keyword arguments shared by every OpenAIServerModel constructed in this module.
common = {
    "api_base": "https://openrouter.ai/api/v1",
    "api_key": OPENROUTER_API_KEY,
    # "extra_body": {"usage": {"include": True}},
}
class MultiAgentSystem:
    """Coordinates specialized agents and their underlying models.

    The system instantiates a ``web_agent`` for browsing and data collection,
    an ``info_agent`` for computation and data manipulation, and a
    ``manager_agent`` that receives the user task and delegates to the other
    two. Three OpenRouter-backed models are created; the Gemini model is only
    referenced by the (currently disabled) answer-verification hook.

    Instances are callable: ``system(task)`` runs the manager agent and
    returns its final answer as a string.
    """

    def __init__(self):
        """Create the models, the two worker agents and the manager agent."""
        # Every model shares the endpoint and credentials held in the
        # module-level ``common`` dict.
        self.deepseek_model = OpenAIServerModel(
            model_id="deepseek/deepseek-r1-0528:free",
            **common,
        )
        self.qwen_model = OpenAIServerModel(
            model_id="qwen/qwen-2.5-coder-32b-instruct:free",
            **common,
        )
        self.gemini_model = OpenAIServerModel(
            model_id="google/gemini-2.0-flash-exp:free",
            **common,
        )

        self.web_agent = CodeAgent(
            model=self.qwen_model,
            tools=[WebSearchTool(), VisitWebpageTool(), WikipediaSearchTool()],
            name="web_agent",
            description=(
                "You are a web browsing agent. Whenever the given {task} involves browsing "
                "the web or a specific website such as Wikipedia or YouTube, you will use "
                "the provided tools. For web-based factual and retrieval tasks, be as precise and source-reliable as possible."
            ),
            additional_authorized_imports=[
                "markdownify",
                "json",
                "requests",
                "urllib.request",
                "urllib.parse",
                # Entries must be importable module names: the ``wikipedia-api``
                # distribution is imported as ``wikipediaapi``.
                "wikipediaapi",
            ],
            verbosity_level=0,
            max_steps=10,
        )

        # NOTE(review): the description below mentions audio_transcribe_tool and
        # object_detection_tool, but only PythonInterpreterTool is attached here
        # (the Space-based tools are commented out at module level).
        self.info_agent = CodeAgent(
            model=self.qwen_model,
            tools=[PythonInterpreterTool()],
            name="info_agent",
            description=(
                "You are an agent tasked with cleaning, parsing, calculating information, and performing OCR if images are provided in the {task}. "
                "You can also analyze images, videos and audio using available tools such as audio_transcribe_tool and object_detection_tool when needed. You handle all math, code, and data manipulation. Use numpy, math, and available libraries. "
                "For image, video, audio tasks, use pytesseract, PIL, chess, or audio_transcribe_tool and object_detection_tool as required."
            ),
            additional_authorized_imports=[
                "numpy",
                "math",
                "pytesseract",
                "PIL",
                "chess",
                # ``BeautifulSoup`` is a class inside ``bs4``, not a module, so
                # authorizing ``bs4`` is what permits
                # ``from bs4 import BeautifulSoup``.
                "bs4",
                "openpyxl",
                "lxml",
            ],
        )

        # NOTE(review): planning_interval (6) is larger than max_steps (4), so
        # periodic re-planning presumably never fires after the initial plan --
        # confirm this is intended.
        self.manager_agent = CodeAgent(
            model=self.deepseek_model,
            tools=[FinalAnswerTool()],
            managed_agents=[self.web_agent, self.info_agent],
            name="manager_agent",
            description=(
                "You are the manager agent. **Respond with a single python code-block only**. "
                "Inside that block you must call the other agents via `agent(name)(task)` "
                "and end with `final_answer({...})`. **No natural language outside the block**"
            ),
            additional_authorized_imports=[
                "json",
                "pandas",
                "numpy",
            ],
            planning_interval=6,
            verbosity_level=2,
            # final_answer_checks=[self.check_reasoning],
            max_steps=4,
        )

    # Disabled answer-verification hook, kept for reference.
    # def check_reasoning(self, final_answer, agent_memory):
    #     model = self.gemini_model
    #     verification_prompt = (
    #         f"Here is a user-given task and the agent steps: {agent_memory.get_succinct_steps()}. "
    #         f"The proposed final answer is: {final_answer}. "
    #         "Please check that the reasoning process is correct: do they correctly answer the given task? "
    #         "First list reasons why yes/no, then write your final decision: PASS in caps lock if it is satisfactory, FAIL if it is not."
    #     )
    #     output = model(verification_prompt)
    #     print("Feedback: ", output)
    #     if "FAIL" in output:
    #         raise Exception(output)
    #     return True

    def __call__(self, task: str) -> str:
        """Run the manager agent on ``task`` and return its final answer text.

        Args:
            task: The user request, in natural language.

        Returns:
            The manager agent's final answer, converted to ``str``.
        """
        # Use ``run`` instead of calling the agent object: in smolagents,
        # calling an agent directly is the entry point for *managed* agents and
        # wraps the task/answer in managed-agent prompt and report text.
        answer = self.manager_agent.run(task)
        # ``run`` may return any object passed to ``final_answer``; normalise
        # to text so the declared return type holds.
        return str(answer)
|