uvpatel7271's picture
added code modularity
9159c06
"""Analyzer for data-science oriented Python code."""
from __future__ import annotations
from typing import Any, Dict
from schemas.response import AnalysisIssue, DomainAnalysis
def analyze_data_science_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis:
"""Inspect pandas and numpy code for vectorization and leakage concerns."""
issues = []
suggestions = []
score = 0.72
if "iterrows(" in code or "itertuples(" in code:
issues.append(
AnalysisIssue(
title="Row-wise dataframe iteration detected",
severity="medium",
description="Looping through dataframe rows is usually slower and less scalable than vectorized operations.",
)
)
suggestions.append("Use vectorized pandas or numpy expressions instead of row-wise iteration.")
score -= 0.18
if "inplace=True" in code:
suggestions.append("Avoid inplace mutation to keep data pipelines easier to reason about and test.")
score -= 0.05
if "fit_transform(" in code and "train_test_split" not in code:
issues.append(
AnalysisIssue(
title="Potential data leakage risk",
severity="high",
description="Feature transforms appear before an explicit train/test split.",
)
)
suggestions.append("Split train and validation data before fitting stateful preprocessing steps.")
score -= 0.2
if not suggestions:
suggestions.append("Add schema assumptions and null-handling checks for production data quality.")
return DomainAnalysis(
domain="data_science",
domain_score=max(0.05, round(score, 4)),
issues=issues,
suggestions=suggestions,
highlights={
"vectorization_risk": float("iterrows(" in code or "itertuples(" in code),
"time_complexity": complexity["time_complexity"],
"uses_pandas": float(parsed.get("uses_pandas", False)),
},
)