Build error
Update app.py
app.py CHANGED
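Review note on the diff below: the new SlicedLLM subclasses LangChain's LLM base class, which is a pydantic model. Assigning undeclared attributes in a hand-written __init__ (self.inner_llm = ..., self.max_chars = ...) typically fails at construction time with an "object has no field" error, which may be what is behind the Build error status above. The conventional shape for a custom LLM declares its state as class-level fields and overrides only _call. A minimal sketch under that assumption (SlicedLLMSketch and its body are illustrative, not the committed code):

from typing import Any, List, Optional

from langchain_core.language_models.llms import LLM

class SlicedLLMSketch(LLM):
    # Pydantic fields declared on the class, not assigned in a custom __init__.
    inner_llm: Any
    max_chars: int = 2048

    @property
    def _llm_type(self) -> str:
        return "sliced-llm"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        # Delegate to the wrapped LLM, then keep only the tail so the ReAct
        # parser sees the most recent Thought / Action / Final Answer.
        text = self.inner_llm.invoke(prompt, stop=stop)
        return text[-self.max_chars :].strip()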
@@ -58,78 +58,158 @@ def safe_calculator_func(expression: str) -> str:
         return f"Error calculating '{expression}': Invalid expression or calculation error ({e})."
 
 class SlicedLLM(LLM):
-
-
-
 
-
-
-
 
+    """
+    Light wrapper around any LangChain LLM (we'll use the HuggingFacePipeline wrapper).
+    Responsibilities:
+      - Call the inner LLM
+      - Extract text robustly from different return shapes
+      - Truncate to `max_chars` from the end (keeps the most recent reasoning)
+      - Strip instruction echoing by keeping from the last 'Thought:' if present
+    """
+
+    def __init__(self, inner_llm, max_chars: int = 2048, **kwargs):
+        self.inner_llm = inner_llm
+        self.max_chars = int(max_chars)
+        # required for LangChain LLM subclasses
+        self.max_retries = kwargs.get("max_retries", 1)
+
+    @property
+    def _llm_type(self) -> str:
+        return "sliced-llm"
+
+    def _call(self, prompt: str, stop=None) -> str:
+        """
+        Core call entrypoint used by LangChain. We call the inner LLM and then post-process.
+        """
+        # 1) Call inner LLM (it may expose _call or be callable)
+        raw = None
+        # inner may be a LangChain LLM (with _call) or a callable pipeline
+        if hasattr(self.inner_llm, "_call"):
+            raw = self.inner_llm._call(prompt, stop=stop)
+        else:
+            # fallback - call and try to extract text
+            # Many pipeline wrappers accept a string and return text or list
+            raw = self.inner_llm(prompt)
+
+        # 2) Extract text from common return shapes
+        text = self._extract_text(raw)
+
+        # 3) Attempt to remove repeated instruction blocks by finding last 'Thought:' anchor
+        # We keep text from the last "Thought:" onward if that appears in the output.
+        # This removes prompt-echoed instruction blocks that often appear earlier in the string.
+        last_thought_idx = text.rfind("\nThought:")
+        if last_thought_idx >= 0:
+            # keep from the last Thought: (include the marker so the parser sees it)
+            text = text[last_thought_idx + 1 :]  # +1 drops the leading newline, keeps the marker
 
+        # 4) Truncate to keep the most recent reasoning / final answer
+        if len(text) > self.max_chars:
+            text = text[-self.max_chars :]
 
+        # 5) Strip leading/trailing whitespace
+        return text.strip()
+
+    def _extract_text(self, raw):
+        """
+        Handle possible return formats:
+          - plain str
+          - list/dict results from HF pipeline
+          - objects exposing .content or ['generated_text']
+        """
+        # Direct string
+        if isinstance(raw, str):
+            return raw
+
+        # If it's a list (transformers pipeline may return a list of dicts)
+        if isinstance(raw, (list, tuple)) and len(raw) > 0:
+            first = raw[0]
+            if isinstance(first, dict):
+                # common keys: 'generated_text', 'text'
+                for k in ("generated_text", "text", "output_text"):
+                    if k in first:
+                        return str(first[k])
+                # else stringify the dict
+                return str(first)
+            else:
+                return str(first)
+
+        # If it's a dict with 'generated_text' etc.
+        if isinstance(raw, dict):
+            for k in ("generated_text", "text", "output_text"):
+                if k in raw:
+                    return str(raw[k])
+            # fallback to string repr
+            return str(raw)
+
+        # Last resort: string conversion
+        return str(raw)
+
+    def _identifying_params(self):
+        return {"inner": getattr(self.inner_llm, "_llm_type", None), "max_chars": self.max_chars}
+
+# --- Completely rewritten LangChainAgentWrapper (drop-in) ---
 class LangChainAgentWrapper:
     """
-
-
-
-
-
-
+    Rewritten, robust LangChain agent wrapper:
+      - loads the Gemma model (model_id argument)
+      - wraps the HF pipeline into HuggingFacePipeline (LangChain)
+      - wraps that into SlicedLLM to truncate / clean model outputs
+      - builds the ReAct prompt (contains {tools} and {tool_names})
+      - creates the agent with create_react_agent + AgentExecutor
     """
 
-    def __init__(
+    def __init__(
+        self,
+        model_id: str = "google/gemma-2b-it",
+        max_new_tokens: int = 96,
+        max_chars: int = 2048,
+        max_iterations: int = 2,
+    ):
         print("Initializing LangChainAgentWrapper...")
-
-        # NOTE: Pick the exact model id you intend to load here.
-        # Keep model_id variable consistent so you don't accidentally load a different model.
-        model_id = "google/gemma-2b-it"
-
         try:
-            #
+            # Lazy/delayed imports
             from langchain.agents import AgentExecutor, create_react_agent
             from langchain_community.tools import DuckDuckGoSearchRun
 
-            # --- Tokenizer & Model
+            # --- Tokenizer & Model ---
             print(f"Loading tokenizer for: {model_id}")
             tokenizer = AutoTokenizer.from_pretrained(model_id)
 
             print(f"Loading model: {model_id}")
-            # Use AutoModelForCausalLM for Gemma; bfloat16 for Ada / L40S style cards
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
                 torch_dtype=torch.bfloat16,
                 device_map="auto",
-                offload_folder="offload",
+                offload_folder="offload",
             )
             print("Model loaded successfully.")
             print(f"Allocated: {memory_allocated()/1e9:.2f} GB | Reserved: {memory_reserved()/1e9:.2f} GB")
 
-            # ---
-            # return_full_text=False avoids echoing the whole prompt
-            # We set a conservative max_new_tokens suitable for ReAct loops and small models
+            # --- HF pipeline (transformers) with safe defaults ---
             llm_pipeline = transformers.pipeline(
                 "text-generation",
                 model=model,
                 tokenizer=tokenizer,
-                max_new_tokens=
-                return_full_text=False,
+                max_new_tokens=max_new_tokens,
+                return_full_text=False,
                 pad_token_id=tokenizer.eos_token_id,
                 eos_token_id=tokenizer.eos_token_id,
             )
-            print("
+            print("Transformers pipeline created successfully.")
 
-            # (langchain_huggingface.HuggingFacePipeline expects a transformers pipeline)
-            self.llm = HuggingFacePipeline(pipeline=llm_pipeline)
-            print("HuggingFacePipeline wrapper created.")
+            # --- Wrap pipeline into LangChain HuggingFacePipeline LLM ---
+            base_lc_llm = HuggingFacePipeline(pipeline=llm_pipeline)
+            # --- Wrap that LLM into our slicer to keep outputs trimmed and to strip instruction echoes ---
+            self.llm = SlicedLLM(base_lc_llm, max_chars=max_chars)
 
+            print("SlicedLLM wrapper created successfully.")
 
-            # ---
+            # --- Tools ---
             print("Defining tools...")
             search_tool = DuckDuckGoSearchRun(
                 name="web_search",
-                description="
+                description="Web search via DuckDuckGo for up-to-date facts/events."
             )
 
-            # Ensure these Tool.name values exactly match the strings in the prompt's action list
             self.tools = [
                 Tool(
                     name="get_current_time_in_timezone",
@@ -143,57 +223,46 @@ class LangChainAgentWrapper:
                     description=safe_calculator_func.__doc__
                 ),
             ]
-            print(f"Tools prepared
+            print(f"Tools prepared: {[t.name for t in self.tools]}")
 
-            # --- ReAct
-            # Important: keep this prompt short and *do not* encourage repetition.
-            # We include Observation because LangChain inserts the tool result back into the scratchpad.
+            # --- ReAct prompt (must contain {tools} and {tool_names}) ---
             react_prompt = PromptTemplate(
                 input_variables=["tools", "tool_names", "agent_scratchpad"],
                 template="""
-
-
-
-
-
-
-
-
-
-
-
+DO NOT REPEAT OR PARAPHRASE ANY PART OF THIS PROMPT.
+
+You are an assistant that strictly follows the ReAct format.
+
+You can use these tools:
+{tools}
+
+Valid tool names: {tool_names}
+
+When responding, follow this exact grammar and include nothing else:
 
-Thought: <reasoning>
-Action: <one of {tool_names} OR "none">
-Action Input: <JSON input for the tool>
+Thought: <brief reasoning>
+Action: <one of {tool_names} OR "none">
+Action Input: <input for the action>
 
-(If Action is not "none", the system will provide an Observation.)
-(After the Observation, you continue with another Thought/Action loop.)
-
-If you choose Action: none, you MUST end with:
-Final Answer: <your final answer>
-
-Begin your reasoning now.
-
-{agent_scratchpad}
-Thought:
-"""
-            )
+(If you choose an action other than "none", the system will insert an Observation before you continue.)
+If Action is "none", finish by outputting:
+Final Answer: <short direct answer>
+
+{agent_scratchpad}
+Thought:
+""",
+            )
+
+            # --- Create agent + executor ---
             print("Creating agent...")
             agent = create_react_agent(self.llm, self.tools, react_prompt)
 
-            # NOTE: We intentionally do NOT add a ConversationBufferWindowMemory here.
-            # ReAct agents benefit from the explicit scratchpad pattern; adding memory can
-            # sometimes re-introduce context and cause token growth. If you really want memory,
-            # prefer a tiny window and test thoroughly.
             self.agent_executor = AgentExecutor(
                 agent=agent,
                 tools=self.tools,
                 verbose=True,
                 handle_parsing_errors=True,
-                max_iterations=
+                max_iterations=max_iterations,
             )
             print("LangChain agent created successfully.")
 
@@ -202,6 +271,22 @@ class LangChainAgentWrapper:
             traceback.print_exc()
             raise RuntimeError(f"LangChain agent initialization failed: {e}") from e
 
+    def __call__(self, question: str) -> str:
+        """
+        Run the agent on a single question.
+        We rely on AgentExecutor to manage the ReAct loops.
+        """
+        print(f"\n--- LangChainAgentWrapper received question: {question[:140]}... ---")
+        try:
+            # AgentExecutor expects {"input": question}
+            response = self.agent_executor.invoke({"input": question})
+            return response.get("output", "No output found.")
+        except Exception as e:
+            print(f"ERROR: LangChain agent execution failed: {e}")
+            traceback.print_exc()
+            # Return an informative string so the outer code can still submit something
+            return f"Agent Error: Failed to process the question. Details: {e}"
+
     def __call__(self, question: str) -> str:
         """
         Run the agent on a single question. We rely on the AgentExecutor to manage