File size: 2,128 Bytes
c29f1fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Analyzer for data-science oriented Python code."""

from __future__ import annotations

from typing import Any, Dict

from schemas.response import AnalysisIssue, DomainAnalysis


def analyze_data_science_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any]) -> DomainAnalysis:
    """Run substring-based heuristics over data-science code and score it.

    The checks are plain substring tests against ``code``:

    * ``iterrows(`` / ``itertuples(`` -> medium-severity issue, score -0.18
    * ``inplace=True`` -> suggestion only, score -0.05
    * ``fit_transform(`` present while ``train_test_split`` is absent ->
      high-severity issue, score -0.2

    Args:
        code: Raw source text to scan.
        parsed: Parser output; only the optional ``"uses_pandas"`` key is read.
        complexity: Complexity summary; ``"time_complexity"`` must be present
            (a missing key raises ``KeyError``).

    Returns:
        A ``DomainAnalysis`` for the ``"data_science"`` domain. The score
        starts at 0.72, is reduced by the penalties above, rounded to four
        decimals and floored at 0.05.
    """

    # Evaluate each textual marker once up front; the row-iteration flag is
    # needed both for the issue list and for the highlights payload.
    iterates_rows = "iterrows(" in code or "itertuples(" in code
    mutates_inplace = "inplace=True" in code
    fits_without_split = "fit_transform(" in code and "train_test_split" not in code

    found_issues = []
    advice = []
    running_score = 0.72

    if iterates_rows:
        row_iteration_issue = AnalysisIssue(
            title="Row-wise dataframe iteration detected",
            severity="medium",
            description="Looping through dataframe rows is usually slower and less scalable than vectorized operations.",
        )
        found_issues.append(row_iteration_issue)
        advice.append("Use vectorized pandas or numpy expressions instead of row-wise iteration.")
        running_score -= 0.18

    if mutates_inplace:
        # Style advice only: no issue is recorded for inplace mutation.
        advice.append("Avoid inplace mutation to keep data pipelines easier to reason about and test.")
        running_score -= 0.05

    if fits_without_split:
        leakage_issue = AnalysisIssue(
            title="Potential data leakage risk",
            severity="high",
            description="Feature transforms appear before an explicit train/test split.",
        )
        found_issues.append(leakage_issue)
        advice.append("Split train and validation data before fitting stateful preprocessing steps.")
        running_score -= 0.2

    # Always hand back at least one suggestion, even when nothing was flagged.
    if not advice:
        advice.append("Add schema assumptions and null-handling checks for production data quality.")

    # Keep the reported score from dropping below the 0.05 floor.
    final_score = max(0.05, round(running_score, 4))

    return DomainAnalysis(
        domain="data_science",
        domain_score=final_score,
        issues=found_issues,
        suggestions=advice,
        highlights={
            "vectorization_risk": float(iterates_rows),
            "time_complexity": complexity["time_complexity"],
            "uses_pandas": float(parsed.get("uses_pandas", False)),
        },
    )