Spaces:

Jack1808
/

Claude_Code

Sleeping

Claude_Code / providers / common / heuristic_tool_parser.py

Jainish1808

Move project files to repository root for Hugging Face Space

bf177ff about 2 months ago

9.72 kB

	import re
	import uuid
	from enum import Enum
	from typing import Any

	from loguru import logger

	# Some OpenAI-compatible backends/models occasionally leak internal sentinel tokens
	# into `delta.content` (e.g. "<|tool_call_end|>"). These should never be shown to
	# end users, and they can disrupt downstream parsing if left in place.
	#
	# A sentinel is "<|", then 1-80 characters other than "|" and ">", then "|>".
	# The pipes are escaped in the regex because an unescaped "|" is alternation.
	_CONTROL_TOKEN_RE = re.compile(r"<\|[^|>]{1,80}\|>")
	# Literal delimiters, used to spot a sentinel that is split across chunks.
	_CONTROL_TOKEN_START = "<|"
	_CONTROL_TOKEN_END = "|>"


	class ParserState(Enum):
	    """States of the streaming tool-call parser."""

	    # Passing ordinary text through while watching for the "●" trigger.
	    TEXT = 1
	    # A "●" was seen; waiting for a complete "<function=Name>" tag.
	    MATCHING_FUNCTION = 2
	    # Inside a tool call; collecting "<parameter=key>value</parameter>" pairs.
	    PARSING_PARAMETERS = 3


	class HeuristicToolParser:
	    """
	    Stateful parser that detects raw text tool calls in the format:
	    ● <function=Name><parameter=key>value</parameter>...

	    This is used as a fallback for models that emit tool calls as text
	    instead of using the structured API.

	    Text is supplied incrementally through ``feed``; ``flush`` closes any tool
	    call that is still open when the stream ends.
	    """

	    # Class-level compiled patterns (compiled once, not per instance)
	    _FUNC_START_PATTERN = re.compile(r"●\s*<function=([^>]+)>")
	    # Matches a parameter up to its closing tag or, failing that, the end of
	    # the buffer; callers check for "</parameter>" to tell the two apart.
	    _PARAM_PATTERN = re.compile(
	        r"<parameter=([^>]+)>(.*?)(?:</parameter>|$)", re.DOTALL
	    )
	    # Used by flush(): a parameter whose closing tag never arrived.
	    _PARTIAL_PARAM_PATTERN = re.compile(r"<parameter=([^>]+)>(.*)$", re.DOTALL)
	    # A buffer tail that could still grow into a "<|...|>" sentinel: the opening
	    # "<|", at most 80 body characters (none of them "|" or ">"), and optionally
	    # the "|" of the closing delimiter.
	    _INCOMPLETE_CONTROL_TOKEN_PATTERN = re.compile(r"<\|[^|>]{0,80}\|?")
	    # How much may accumulate after a "●" before giving up on a function tag.
	    _MAX_FUNCTION_TAG_WAIT = 100

	    def __init__(self) -> None:
	        self._state = ParserState.TEXT
	        # Text received but not yet emitted or consumed.
	        self._buffer = ""
	        # Details of the tool call currently being assembled, if any.
	        self._current_tool_id = None
	        self._current_function_name = None
	        self._current_parameters = {}

	    def _strip_control_tokens(self, text: str) -> str:
	        """Return ``text`` with every complete sentinel token removed."""
	        # A token that is split across chunks is removed once the buffer
	        # contains all of it.
	        return _CONTROL_TOKEN_RE.sub("", text)

	    def _incomplete_control_token_start(self) -> int:
	        """
	        Return the index at which the buffer's trailing, still-incomplete
	        "<|...|>" sentinel begins, or -1 if the buffer does not end with one.

	        Only a tail that can still become a valid sentinel counts. A "<|" that
	        is followed by too many characters, or by a "|" or ">" in the wrong
	        place, is ordinary text and must not be withheld.
	        """
	        start = self._buffer.rfind(_CONTROL_TOKEN_START)
	        if start == -1:
	            return -1
	        if self._INCOMPLETE_CONTROL_TOKEN_PATTERN.fullmatch(self._buffer, start):
	            return start
	        return -1

	    def _split_incomplete_control_token_tail(self) -> str:
	        """
	        If the buffer ends with an incomplete "<|...|>" sentinel token, keep that
	        fragment in the buffer and return the safe-to-emit prefix.

	        This prevents leaking raw sentinel fragments to the user when streaming.
	        Returns "" when there is no such fragment, and also when the fragment
	        starts at the beginning of the buffer. Use
	        ``_incomplete_control_token_start`` to tell those cases apart.
	        """
	        start = self._incomplete_control_token_start()
	        if start == -1:
	            return ""

	        prefix = self._buffer[:start]
	        self._buffer = self._buffer[start:]
	        return prefix

	    def _current_tool_use(self) -> dict[str, Any]:
	        """Build the Anthropic-format tool_use block for the call in progress."""
	        return {
	            "type": "tool_use",
	            "id": self._current_tool_id,
	            "name": self._current_function_name,
	            "input": self._current_parameters,
	        }

	    def feed(self, text: str) -> tuple[str, list[dict[str, Any]]]:
	        """
	        Feed text into the parser.
	        Returns a tuple of (filtered_text, detected_tool_calls).

	        filtered_text: Text that should be passed through as normal message content.
	        detected_tools: List of Anthropic-format tool_use blocks.
	        """
	        self._buffer = self._strip_control_tokens(self._buffer + text)
	        detected_tools: list[dict[str, Any]] = []
	        filtered_output_parts: list[str] = []

	        while True:
	            if self._state == ParserState.TEXT:
	                # Look for the trigger character
	                idx = self._buffer.find("●")
	                if idx != -1:
	                    filtered_output_parts.append(self._buffer[:idx])
	                    self._buffer = self._buffer[idx:]
	                    self._state = ParserState.MATCHING_FUNCTION
	                else:
	                    # Avoid emitting an incomplete "<|...|>" sentinel fragment if
	                    # the token got split across streaming chunks. The index is
	                    # compared against -1 (not tested for truthiness) so that a
	                    # fragment at the very start of the buffer is held back too.
	                    tail_start = self._incomplete_control_token_start()
	                    if tail_start == -1:
	                        filtered_output_parts.append(self._buffer)
	                        self._buffer = ""
	                    else:
	                        filtered_output_parts.append(self._buffer[:tail_start])
	                        self._buffer = self._buffer[tail_start:]
	                    break

	            if self._state == ParserState.MATCHING_FUNCTION:
	                # We need enough buffer to match the function tag
	                # e.g. "● <function=Grep>"
	                match = self._FUNC_START_PATTERN.search(self._buffer)
	                if match:
	                    # search() may land on a later "●"; whatever precedes the
	                    # match is ordinary text and must not be dropped.
	                    if match.start() > 0:
	                        filtered_output_parts.append(self._buffer[: match.start()])

	                    self._current_function_name = match.group(1).strip()
	                    self._current_tool_id = f"toolu_heuristic_{uuid.uuid4().hex[:8]}"
	                    self._current_parameters = {}

	                    # Consume the function start from buffer
	                    self._buffer = self._buffer[match.end() :]
	                    self._state = ParserState.PARSING_PARAMETERS
	                    logger.debug(
	                        "Heuristic bypass: Detected start of tool call '{}'",
	                        self._current_function_name,
	                    )
	                elif len(self._buffer) > self._MAX_FUNCTION_TAG_WAIT:
	                    # Probably not a tool call: release the "●" as text and
	                    # rescan the remainder from the TEXT state.
	                    filtered_output_parts.append(self._buffer[0])
	                    self._buffer = self._buffer[1:]
	                    self._state = ParserState.TEXT
	                else:
	                    # We have "●" but not the full tag yet; wait for more data.
	                    break

	            if self._state == ParserState.PARSING_PARAMETERS:
	                # Consume every parameter that already has its closing tag.
	                while True:
	                    param_match = self._PARAM_PATTERN.search(self._buffer)
	                    if not param_match or "</parameter>" not in param_match.group(0):
	                        break

	                    # Preserve any content that precedes the parameter.
	                    pre_match_text = self._buffer[: param_match.start()]
	                    if pre_match_text:
	                        filtered_output_parts.append(pre_match_text)

	                    key = param_match.group(1).strip()
	                    val = param_match.group(2).strip()
	                    self._current_parameters[key] = val
	                    self._buffer = self._buffer[param_match.end() :]

	                # Heuristic for completion:
	                # 1. Another ● character (start of the NEXT tool call), or
	                # 2. leftover text that is neither a tag nor a pending parameter.
	                # A pause in the stream is not detected here; the caller handles
	                # that by calling flush().
	                finished_tool_call = False
	                idx = self._buffer.find("●")
	                if idx != -1:
	                    # Close the current call, emitting any text before the ●.
	                    if idx > 0:
	                        filtered_output_parts.append(self._buffer[:idx])
	                    self._buffer = self._buffer[idx:]
	                    finished_tool_call = True
	                elif (
	                    self._buffer
	                    and not self._buffer.strip().startswith("<")
	                    and "<parameter=" not in self._buffer
	                ):
	                    # Plain text follows the parameters: emit it and close.
	                    filtered_output_parts.append(self._buffer)
	                    self._buffer = ""
	                    finished_tool_call = True

	                if not finished_tool_call:
	                    break

	                detected_tools.append(self._current_tool_use())
	                logger.debug(
	                    "Heuristic bypass: Emitting tool call '{}' with {} params",
	                    self._current_function_name,
	                    len(self._current_parameters),
	                )
	                self._state = ParserState.TEXT
	                # Continue loop to process remaining buffer (which is empty or starts with ●)

	        return "".join(filtered_output_parts), detected_tools

	    def flush(self) -> list[dict[str, Any]]:
	        """
	        Flush any remaining tool calls in the buffer.

	        If a tool call is still open, it is emitted with whatever parameters
	        were collected, including a trailing parameter that lacks its closing
	        tag. The parser is then reset to the TEXT state with an empty buffer,
	        whatever state it was in; buffered text that is not part of a tool call
	        is discarded.
	        """
	        self._buffer = self._strip_control_tokens(self._buffer)
	        detected_tools: list[dict[str, Any]] = []
	        if self._state == ParserState.PARSING_PARAMETERS:
	            # Try to extract any partial parameters remaining in buffer
	            # Even without </parameter>
	            for m in self._PARTIAL_PARAM_PATTERN.finditer(self._buffer):
	                key = m.group(1).strip()
	                val = m.group(2).strip()
	                self._current_parameters[key] = val

	            detected_tools.append(self._current_tool_use())

	        # Reset unconditionally so a flush while waiting for a function tag does
	        # not leave the parser stuck in that state with stale text.
	        self._state = ParserState.TEXT
	        self._buffer = ""

	        return detected_tools