Spaces:

Pulastya0
/

Data-Science-Agent

Running

File size: 10,215 Bytes

"""
Agent-Specific Tool Mapping
Maps specialist agents to their relevant tools for dynamic loading.
"""

# Tool categories and the tool names that belong to each.
# Keys are the category names referenced from the "categories" lists in
# AGENT_TOOL_MAPPING; values are lists of tool-name strings.
# get_tools_for_agent() looks categories up here and silently skips any
# category name that is not a key of this dict.
# Some names (profile_dataset, get_smart_summary, execute_python_code) are
# also listed in CORE_TOOLS.
TOOL_CATEGORIES = {
    "profiling": [
        "profile_dataset",
        "detect_data_quality_issues",
        "analyze_correlations",
        "get_smart_summary",
    ],
    "cleaning": [
        "clean_missing_values",
        "handle_outliers",
        "fix_data_types",
        "force_numeric_conversion",
        "smart_type_inference",
        "remove_duplicates",
    ],
    "feature_engineering": [
        "create_time_features",
        "encode_categorical",
        "create_interaction_features",
        "create_ratio_features",
        "create_statistical_features",
        "create_log_features",
        "create_binned_features",
        "create_aggregation_features",
        "auto_feature_engineering",
    ],
    "visualization": [
        "generate_interactive_scatter",
        "generate_interactive_histogram",
        "generate_interactive_box_plots",
        "generate_interactive_correlation_heatmap",
        "generate_interactive_time_series",
        "generate_plotly_dashboard",
        "generate_eda_plots",
        "generate_combined_eda_report",
    ],
    "modeling": [
        "train_baseline_models",
        "train_with_autogluon",
        "predict_with_autogluon",
        "optimize_autogluon_model",
        "analyze_autogluon_model",
        "extend_autogluon_training",
        "train_multilabel_autogluon",
        "hyperparameter_tuning",
        "perform_cross_validation",
        "train_ensemble_models",
        "auto_ml_pipeline",
        "evaluate_model_performance",
    ],
    "time_series": [
        "detect_seasonality",
        "decompose_time_series",
        "forecast_arima",
        "forecast_prophet",
        "forecast_with_autogluon",
        "backtest_timeseries",
        "analyze_timeseries_model",
        "detect_anomalies_time_series",
    ],
    "nlp": [
        "extract_entities",
        "sentiment_analysis",
        "topic_modeling",
        "text_classification",
        "text_preprocessing",
    ],
    "computer_vision": [
        "image_classification",
        "object_detection",
        "image_preprocessing",
    ],
    "business_intelligence": [
        "calculate_kpis",
        "trend_analysis",
        "cohort_analysis",
        "churn_prediction",
    ],
    "production": [
        "export_model_to_onnx",
        "generate_inference_code",
        "create_model_documentation",
        "validate_model_drift",
    ],
    # Only general_agent lists this category; execute_python_code is also
    # in CORE_TOOLS, so every agent gets that one tool regardless.
    "code_execution": [
        "execute_python_code",
        "debug_code",
    ]
}

# Map specialist agents to their relevant tool categories.
# Each entry has two keys:
#   "categories"  - list of TOOL_CATEGORIES keys whose tools the agent gets
#   "description" - one-line summary returned by get_agent_description()
# "general_agent" doubles as the fallback entry: get_tools_for_agent() and
# get_tool_categories_for_agent() use it for any agent name not listed here.
AGENT_TOOL_MAPPING = {
    "data_quality_agent": {
        "categories": ["profiling", "cleaning"],
        "description": "Focuses on data profiling, quality assessment, and cleaning operations"
    },
    "preprocessing_agent": {
        "categories": ["cleaning", "feature_engineering", "profiling"],
        "description": "Handles data cleaning, transformation, and feature engineering"
    },
    "visualization_agent": {
        "categories": ["visualization", "profiling"],
        "description": "Creates charts, plots, and interactive dashboards"
    },
    "modeling_agent": {
        "categories": ["modeling", "feature_engineering", "profiling"],
        "description": "Trains, tunes, and evaluates machine learning models"
    },
    "time_series_agent": {
        "categories": ["time_series", "profiling", "visualization"],
        "description": "Specializes in time series analysis and forecasting"
    },
    "nlp_agent": {
        "categories": ["nlp", "profiling", "visualization"],
        "description": "Natural language processing and text analytics"
    },
    "computer_vision_agent": {
        "categories": ["computer_vision", "profiling"],
        "description": "Image processing and computer vision tasks"
    },
    "business_intelligence_agent": {
        "categories": ["business_intelligence", "visualization", "profiling"],
        "description": "Business metrics, KPIs, and strategic insights"
    },
    "production_agent": {
        "categories": ["production", "modeling"],
        "description": "Model deployment, monitoring, and production operations"
    },
    # Fallback entry for unknown agent names (see the lookup helpers below).
    "general_agent": {
        "categories": ["profiling", "cleaning", "visualization", "code_execution"],
        "description": "General purpose agent for exploratory analysis"
    }
}

# Core tools that should always be available regardless of agent.
# get_tools_for_agent() seeds its result with these names before adding the
# tools from the agent's categories, so they are included even when none of
# the agent's categories contains them.
CORE_TOOLS = [
    "profile_dataset",
    "get_smart_summary",
    "execute_python_code",
]


def get_tools_for_agent(agent_name: str) -> list:
    """
    Get list of tool names relevant to a specific agent.

    The result is de-duplicated and returned in a stable order: the
    CORE_TOOLS first, then the tools of each of the agent's categories in
    the order the categories are listed in AGENT_TOOL_MAPPING. Agent names
    that are not in AGENT_TOOL_MAPPING get the "general_agent" tool set.

    Args:
        agent_name: Name of the specialist agent

    Returns:
        List of unique tool names the agent can use
    """
    agent_info = AGENT_TOOL_MAPPING.get(agent_name)
    if agent_info is None:
        # Default to general agent tools
        agent_info = AGENT_TOOL_MAPPING["general_agent"]

    # A dict is used as an insertion-ordered set. Going through a plain set
    # would make the order of the returned list depend on string hash
    # randomization, i.e. differ from one process to the next.
    ordered_tools = dict.fromkeys(CORE_TOOLS)  # Start with core tools

    for category in agent_info["categories"]:
        # Category names without an entry in TOOL_CATEGORIES contribute nothing.
        ordered_tools.update(dict.fromkeys(TOOL_CATEGORIES.get(category, ())))

    return list(ordered_tools)


def get_tool_categories_for_agent(agent_name: str) -> list:
    """
    Get categories of tools relevant to a specific agent.

    Agent names that are not in AGENT_TOOL_MAPPING get the categories of
    "general_agent".

    Args:
        agent_name: Name of the specialist agent

    Returns:
        New list of tool category names; callers may modify it freely
    """
    if agent_name not in AGENT_TOOL_MAPPING:
        agent_name = "general_agent"

    # Hand out a copy: returning the stored list itself would let a caller's
    # append/remove silently rewrite the module-level AGENT_TOOL_MAPPING.
    return list(AGENT_TOOL_MAPPING[agent_name]["categories"])


def filter_tools_by_names(all_tools: list, tool_names: list) -> list:
    """
    Pick the requested tools out of a list of tool definitions.

    Only definitions whose "type" is "function" are considered. Every
    selected definition is passed through compress_tool_definition to
    reduce token usage before it is returned.

    Args:
        all_tools: List of all tool definitions (from TOOLS registry)
        tool_names: List of tool names to include

    Returns:
        Compressed definitions of the matching tools, in all_tools order
    """
    wanted = set(tool_names)

    def _is_wanted(entry: dict) -> bool:
        # Non-function entries are never selected, whatever their name.
        if entry.get("type") != "function":
            return False
        return entry.get("function", {}).get("name") in wanted

    return [
        compress_tool_definition(entry)
        for entry in all_tools
        if _is_wanted(entry)
    ]


def compress_tool_definition(tool: dict) -> dict:
    """
    Compress tool definition to reduce token usage.

    Shortens the tool description and the description of every top-level
    parameter via compress_description. The input definition is never
    modified: every dict whose content changes is copied first. Values that
    are left unchanged (e.g. "required" lists, nested schemas) are shared
    with the input rather than copied.

    Args:
        tool: Tool definition dict. Definitions whose "type" is not
            "function" are returned as-is.

    Returns:
        Compressed tool definition. The "parameters" key is present only
        when the input definition has a non-None "parameters" value.

    Raises:
        KeyError: If a function definition has no "function" or "name" key
    """
    if tool.get("type") != "function":
        return tool

    def _with_short_description(param_info):
        # Build a new dict instead of assigning into param_info, which
        # still belongs to the caller's tool registry.
        if isinstance(param_info, dict) and isinstance(param_info.get("description"), str):
            return {
                **param_info,
                "description": compress_description(param_info["description"]),
            }
        return param_info

    function = tool["function"]
    compressed_function = {
        "name": function["name"],
        # A missing or None description is treated as empty text.
        "description": compress_description(function.get("description") or ""),
    }

    parameters = function.get("parameters")
    if parameters is not None:
        # Compress parameter descriptions
        if isinstance(parameters, dict) and isinstance(parameters.get("properties"), dict):
            parameters = dict(parameters)  # shallow copy; "properties" is replaced below
            parameters["properties"] = {
                param_name: _with_short_description(param_info)
                for param_name, param_info in parameters["properties"].items()
            }
        compressed_function["parameters"] = parameters

    return {"type": "function", "function": compressed_function}


def compress_description(description: str) -> str:
    """
    Compress a tool or parameter description.

    Removes examples, extra whitespace, and verbose explanations
    while keeping core functionality description.

    The result, including the trailing "..." added on truncation, never
    exceeds the length limit. Compressing an already compressed description
    therefore leaves it unchanged, provided the same limit is selected on
    both passes.

    Args:
        description: Original description; None or "" yields ""

    Returns:
        Compressed description
    """
    if not description:
        return ""

    # Remove everything after "Example:" or "Examples:". Both markers are
    # needed: "Examples:" does not contain the substring "Example:".
    for marker in ("Example:", "Examples:"):
        description = description.split(marker)[0]

    # Remove extra whitespace and newlines (this also trims both ends)
    description = " ".join(description.split())

    # Descriptions containing "Use this" are taken to be tool descriptions
    # and get 250 chars; everything else (parameters) gets 150.
    max_length = 250 if "Use this" in description else 150
    if len(description) > max_length:
        ellipsis = "..."
        # Reserve room for the ellipsis so the final text fits the limit,
        # then back up to the previous space to avoid cutting mid-word.
        kept = description[:max_length - len(ellipsis)].rsplit(' ', 1)[0]
        description = kept + ellipsis

    return description


def get_agent_description(agent_name: str) -> str:
    """
    Look up the one-line specialization summary of an agent.

    Args:
        agent_name: Name of the specialist agent

    Returns:
        The "description" stored for the agent in AGENT_TOOL_MAPPING, or a
        generic description when the agent name is not registered there
    """
    entry = AGENT_TOOL_MAPPING.get(agent_name)
    if entry is None:
        # Unknown agent: fixed generic text, not general_agent's description.
        return "General purpose data science agent"
    return entry["description"]


def suggest_next_agent(current_agent: str, completed_tools: list) -> "str | None":
    """
    Suggest the next agent to hand off to based on completed tools.

    The current agent is kept until at least 30% of its tools (as reported
    by get_tools_for_agent, which includes the core tools) appear in
    completed_tools. After that the next agent of the fixed workflow
    data_quality -> preprocessing -> visualization -> modeling -> production
    is suggested.

    Args:
        current_agent: Current agent name
        completed_tools: List of tool names already executed; None is
            treated like an empty list

    Returns:
        Suggested next agent name (possibly current_agent itself), or None
        if the current agent has no successor in the workflow
    """
    # Define typical workflow progressions
    workflows = {
        "data_quality_agent": "preprocessing_agent",  # After profiling → cleaning
        "preprocessing_agent": "visualization_agent",   # After cleaning → visualize
        "visualization_agent": "modeling_agent",        # After EDA → modeling
        "modeling_agent": "production_agent",           # After training → deploy
    }

    # Check if current agent has completed its primary tasks
    agent_tools = set(get_tools_for_agent(current_agent))
    completed_set = set(completed_tools or ())

    # max(..., 1) guards the division should the agent's tool set be empty.
    used_fraction = len(completed_set & agent_tools) / max(len(agent_tools), 1)

    # If less than 30% of agent's tools used, stay with current agent
    if used_fraction < 0.3:
        return current_agent

    # Suggest next agent in typical workflow; agents that are not a key of
    # `workflows` (e.g. production_agent) have no successor.
    return workflows.get(current_agent)