File size: 5,835 Bytes
51abc3c
 
 
 
 
 
 
 
 
 
 
 
 
 
2253aff
 
51abc3c
 
2253aff
51abc3c
7118cc4
51abc3c
391ac5c
 
 
 
56e22f3
391ac5c
2253aff
b4c715a
 
 
 
 
2253aff
7118cc4
51abc3c
 
 
 
 
 
 
623cd35
51abc3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6f58ab
51abc3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6f58ab
b4c715a
51abc3c
 
 
2253aff
 
51abc3c
 
 
 
 
 
 
551ab80
 
 
 
51abc3c
9ea452b
51abc3c
 
 
e6f58ab
51abc3c
 
 
 
a8a0099
 
 
51abc3c
 
 
 
 
 
a8a0099
51abc3c
a8a0099
 
51abc3c
 
a8a0099
 
 
 
 
 
 
 
 
 
 
 
 
51abc3c
c9fdc9d
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""High level multi-agent system powered by OpenRouter models.

This module sets up a manager agent that delegates tasks to specialized
web and information agents. It relies on the ``smolagents`` framework and
OpenRouter API models for language generation and verification.
"""

from smolagents import (
    CodeAgent,
    VisitWebpageTool,
    WebSearchTool,
    WikipediaSearchTool,
    PythonInterpreterTool,
    FinalAnswerTool,
    OpenAIServerModel, 
    Tool,
)
from smolagents.utils import encode_image_base64, make_image_url
#from vision_tool import image_reasoning_tool
import os
# Optional Hugging Face token read from the environment; None when unset.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")

#audio_transcribe_tool = Tool.from_space(
 #   space_id = "hf-audio/whisper-large-v3",
  #  name = "audio_to_text",
   # description = "Transcribe long-form YouTube videos or audio inputs. Paste the URL to a YouTube video or upload audio file to get the transcript.",
    
#)

#object_detection_tool = Tool.from_space(
 #   space_id = "stevengrove/YOLO-World", 
  #  name = "Real-Time Open-Vocabulary Object Detector",
   # description = "Detect objects in images or videos."
#)


# The OpenRouter key is mandatory: importing this module fails when the
# variable is missing or empty.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if OPENROUTER_API_KEY is None or OPENROUTER_API_KEY == "":
    raise EnvironmentError("OPENROUTER_API_KEY environment variable not set")

# Keyword arguments shared by every OpenAIServerModel built in this module.
common = {
    "api_base": "https://openrouter.ai/api/v1",
    "api_key": OPENROUTER_API_KEY,
    # "extra_body": {"usage": {"include": True}},
}


class MultiAgentSystem:
    """Coordinate a manager agent and two specialized sub-agents.

    ``__init__`` builds three OpenRouter-backed models and three
    ``CodeAgent`` instances:

    * ``web_agent`` -- web search, page visiting and Wikipedia lookup
      (qwen model).
    * ``info_agent`` -- Python-interpreter based computation and parsing
      (qwen model).
    * ``manager_agent`` -- owns the two agents above as managed agents and
      produces the final answer (deepseek model).

    ``gemini_model`` is instantiated as well, but it is only referenced by
    the answer-verification hook that is currently commented out.

    Calling an instance runs ``manager_agent`` on a task and returns its
    final answer as text.
    """

    def __init__(self):
        """Create the models and wire the agent hierarchy together."""
        self.deepseek_model = OpenAIServerModel(
            model_id="deepseek/deepseek-r1-0528:free",
            **common,
        )
        self.qwen_model = OpenAIServerModel(
            model_id="qwen/qwen-2.5-coder-32b-instruct:free",
            **common,
        )
        # Only used by the disabled ``check_reasoning`` hook below.
        self.gemini_model = OpenAIServerModel(
            model_id="google/gemini-2.0-flash-exp:free",
            **common,
        )

        self.web_agent = CodeAgent(
            model=self.qwen_model,
            tools=[WebSearchTool(), VisitWebpageTool(), WikipediaSearchTool()],
            name="web_agent",
            description=(
                "You are a web browsing agent. Whenever the given {task} involves browsing "
                "the web or a specific website such as Wikipedia or YouTube, you will use "
                "the provided tools. For web-based factual and retrieval tasks, be as precise and source-reliable as possible."
            ),
            additional_authorized_imports=[
                "markdownify",
                "json",
                "requests",
                "urllib.request",
                "urllib.parse",
                # Import name of the ``Wikipedia-API`` distribution. The
                # hyphenated PyPI name ("wikipedia-api") is not a valid
                # module name and could never match an import statement.
                "wikipediaapi",
            ],
            verbosity_level=0,
            max_steps=10,
        )

        # NOTE(review): the description below mentions audio_transcribe_tool
        # and object_detection_tool, but only PythonInterpreterTool is
        # registered here (the Tool.from_space definitions at module level
        # are commented out). Confirm whether the prompt should be trimmed
        # or the tools re-enabled.
        self.info_agent = CodeAgent(
            model=self.qwen_model,
            tools=[PythonInterpreterTool()],
            name="info_agent",
            description=(
                "You are an agent tasked with cleaning, parsing, calculating information, and performing OCR if images are provided in the {task}. "
                "You can also analyze images, videos and audio using available tools such as audio_transcribe_tool and object_detection_tool when needed. You handle all math, code, and data manipulation. Use numpy, math, and available libraries. "
                "For image, video, audio tasks, use pytesseract, PIL, chess, or audio_transcribe_tool and object_detection_tool as required."
            ),
            additional_authorized_imports=[
                "numpy",
                "math",
                "pytesseract",
                "PIL",
                "chess",
                "bs4",
                "BeautifulSoup",
                "openpyxl",
                "lxml",
            ],
        )

        # NOTE(review): planning_interval (6) is larger than max_steps (4);
        # confirm this combination is intentional.
        self.manager_agent = CodeAgent(
            model=self.deepseek_model,
            tools=[FinalAnswerTool()],
            managed_agents=[self.web_agent, self.info_agent],
            name="manager_agent",
            description=(
                "You are the manager agent. **Respond with a single python code-block only**. "
                "Inside that block you must call the other agents via `agent(name)(task)` "
                "and end with `final_answer({...})`. **No natural language outside the block**"
            ),
            additional_authorized_imports=[
                "json",
                "pandas",
                "numpy",
            ],
            planning_interval=6,
            verbosity_level=2,
            # final_answer_checks=[self.check_reasoning],
            max_steps=4,
        )

    # Disabled answer-verification hook, kept for reference.
    # def check_reasoning(self, final_answer, agent_memory):
    #     model = self.gemini_model
    #     verification_prompt = (
    #         f"Here is a user-given task and the agent steps: {agent_memory.get_succinct_steps()}. "
    #         f"The proposed final answer is: {final_answer}. "
    #         "Please check that the reasoning process is correct: do they correctly answer the given task? "
    #         "First list reasons why yes/no, then write your final decision: PASS in caps lock if it is satisfactory, FAIL if it is not."
    #     )
    #     output = model(verification_prompt)
    #     print("Feedback: ", output)
    #     if "FAIL" in output:
    #         raise Exception(output)
    #     return True

    def __call__(self, task: str) -> str:
        """Run ``manager_agent`` on ``task`` and return its final answer text.

        The manager is started through ``run()``. Per the smolagents
        documentation, invoking an agent object directly
        (``agent(task)``) is the entry point intended for *managed* agents:
        it wraps the task and the answer in the managed-agent prompt
        templates, so the caller would receive a report string rather than
        the bare answer.

        Args:
            task: The user task, in natural language.

        Returns:
            The manager's final answer. Non-string answers are converted
            with ``str()`` so the declared return type always holds.
        """
        answer = self.manager_agent.run(task)
        return answer if isinstance(answer, str) else str(answer)